From 049d5583a5c320e3949bed04ab2329eb43a16be9 Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Wed, 27 Apr 2016 09:05:46 +0000
Subject: [PATCH 1/7] lib/checksum_simd.dasl: IP checksum in AVX2 assembler
 (prototype)

See source code comments for implementation status/notes.
---
 src/lib/checksum_simd.dasl | 111 +++++++++++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 src/lib/checksum_simd.dasl

diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl
new file mode 100644
index 0000000000..1aa9ff302a
--- /dev/null
+++ b/src/lib/checksum_simd.dasl
@@ -0,0 +1,111 @@
+-- checksum_simd: SIMD checksum routines
+-- Use of this source code is governed by the Apache 2.0 license; see COPYING.
+
+module(..., package.seeall)
+
+local dasm = require("dasm")
+local ffi = require("ffi")
+local C = ffi.C
+
+|.arch x64
+|.actionlist actions
+|.globalnames globalnames
+
+-- Calculate checksum using AVX2 vector instructions. Optimized for
+-- both speed and short/simple code.
+--
+-- Algorithm:
+--
+-- Operate on 32-byte chunks:
+--   Divide each chunk into 16xU16 values.
+--   Sum 1st 8xU16 into 8xU32 accumulators
+--   Sum 2nd 8xU16 into 8xU32 accumulators
+-- "Fold" accumulators from 8xU32 down to the 1xU16 result.
+--
+-- The input is temporarily padded with a 32-byte "trailer" block of
+-- zeros at the end. This means there is no "remainder" to worry about.
+-- This requires input pointers for which this is safe (e.g. 'struct
+-- packet').
+--
+-- TODO:
+--   Check carry/overflow conditions more closely. Currently not
+--   correct and possibly in a fatal way. Define boundaries e.g.
+--   maximum safe input length.
+--
+--   Check for a better/faster idea/method than zero-padding the end.
+--   Writing 32-bytes to an unaligned address seems to cost 20 cycles
+--   both with vector (VMOVDQU) and with integer (4 x MOVQ) moves.
+--
+-- This algorithm is a new formulation based on:
+--   Tony Rogvall's C intrinsics code (Snabb arch/avx2.c) and ideas.
+--   RFC 1071 "Computing the Internet Checksum"
+--   Article http://locklessinc.com/articles/tcp_checksum/
+--   Article https://www.klittlepage.com/2013/12/10/accelerated-fix-processing-via-avx2-vector-instructions/ 
+
+function asm_checksum_avx2 (Dst)
+      |->checksum_avx2:
+      | add rsi, rdi                 -- rsi = data pointer, rdi = end address
+      | vpxor ymm0, ymm0, ymm0       -- ymm0 = 0 (useful constant zero)
+      | vpxor ymm1, ymm1, ymm1       -- ymm1 = 0 (8 x u32 accumulators)
+      | vmovdqu ymm6, [rsi]          -- Save the "trailer" 32-bytes
+      | vmovdqu [rsi], ymm0          -- Overwrite the trailer with zeros
+      |1:
+      | vmovdqu ymm2, [rdi]          -- Load 32-byte chunk
+      | vpunpcklwd ymm3, ymm2, ymm0  -- Unpack low 8*u16 values into 8*u32 vector
+      | vpaddd     ymm1, ymm1, ymm3  -- .. sum into 8*u32 accumulator
+      | vpunpckhwd ymm3, ymm2, ymm0  -- Repeat with high 8*u16 values
+      | vpaddd     ymm1, ymm1, ymm3  -- ...
+      | add rdi, 0x20                -- Advance input pointer
+      | cmp rdi, rsi                 -- Check for end of input
+      | jl <1
+      -- Finish up:
+      | vmovdqu [rsi], ymm6          -- Restore the "trailer"
+      | vphaddd ymm1, ymm1, ymm0     -- Fold from 8xU32 to 4xU32 accumulators
+      | vphaddd ymm1, ymm1, ymm0     -- Fold from 4xU32 to 2xU32 accumulators
+      | vextracti128 xmm2, ymm1, 1   -- Separate remaining accumulators
+      | vpaddw xmm1, xmm1, xmm2      -- Fold into 1xU32 accumulator
+      | vmovd edx, xmm1              -- Extract U32 accumulator into integer reg
+      | movzx rax, dx                -- Get low 16 bits in dx
+      | shr edx, 16                  -- Get high 16 bits in ax
+      | add ax, dx                   -- Sum into U16 accumulator
+      | adc ax, 0                    -- Add carry
+      | xchg al, ah                  -- Swap to network byte order
+      | ret
+end
+
+local Dst, globals = dasm.new(actions, nil, nil, 1 + #globalnames)
+asm_checksum_avx2(Dst)
+local mcode, size = Dst:build()
+local entry = dasm.globals(globals, globalnames)
+
+cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2)
+
+_anchor = mcode
+
+dasm.dump(mcode, size)
+
+function selftest ()
+   require("lib.checksum")
+   local pmu = require("lib.pmu")
+   local sz = 32
+   local s = ffi.new("char["..sz.."]")
+   for i = 0, sz do
+      s[i] = i % 256
+   end
+   local n = 1e6
+   print("asm")
+   -- Simple benchmark vs C intrinsics code.
+   -- Tweak value of 'sz' to see different sizes.
+   if pmu.is_available() then
+      pmu.profile(function () for i = 1, n do cksum(s, sz) end end,
+            {}, {call=n, byte=sz*n, block=sz*n/32})
+      print("c")
+      pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end,
+            {}, {call=n, byte=sz*n, block=sz*n/32})
+   end
+   local v = cksum(s, sz-1)
+   print("value", v, bit.tohex(v))
+   local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0))
+   print("ref", r, bit.tohex(r))
+end
+

From 4993c37ea3fbf2d580dd8cf458b7faf79a47dd2b Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Thu, 28 Apr 2016 04:25:35 +0000
Subject: [PATCH 2/7] lib.checksum_simd: Fix overflow bug - now works

Fixes an overflow bug where the 32-bit accumulators were summed using
a 16-bit add instruction. Checksums now seem to be correct (same as
existing routine) for up to 128KB inputs.
---
 src/lib/checksum_simd.dasl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl
index 1aa9ff302a..7125b053ae 100644
--- a/src/lib/checksum_simd.dasl
+++ b/src/lib/checksum_simd.dasl
@@ -63,7 +63,7 @@ function asm_checksum_avx2 (Dst)
       | vphaddd ymm1, ymm1, ymm0     -- Fold from 8xU32 to 4xU32 accumulators
       | vphaddd ymm1, ymm1, ymm0     -- Fold from 4xU32 to 2xU32 accumulators
       | vextracti128 xmm2, ymm1, 1   -- Separate remaining accumulators
-      | vpaddw xmm1, xmm1, xmm2      -- Fold into 1xU32 accumulator
+      | vpaddd xmm1, xmm1, xmm2      -- Fold into 1xU32 accumulator
       | vmovd edx, xmm1              -- Extract U32 accumulator into integer reg
       | movzx rax, dx                -- Get low 16 bits in dx
       | shr edx, 16                  -- Get high 16 bits in ax

From 14d78e489e256691baa97bd6a99f0949ac5f8ca8 Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Thu, 28 Apr 2016 04:27:55 +0000
Subject: [PATCH 3/7] lib/checksum_simd.dasl: Improved comments and selftest

---
 src/lib/checksum_simd.dasl | 44 ++++++++++++++++++++++----------------
 1 file changed, 26 insertions(+), 18 deletions(-)

diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl
index 7125b053ae..3b8879e21e 100644
--- a/src/lib/checksum_simd.dasl
+++ b/src/lib/checksum_simd.dasl
@@ -28,13 +28,10 @@ local C = ffi.C
 -- packet').
 --
 -- TODO:
---   Check carry/overflow conditions more closely. Currently not
---   correct and possibly in a fatal way. Define boundaries e.g.
---   maximum safe input length.
---
---   Check for a better/faster idea/method than zero-padding the end.
---   Writing 32-bytes to an unaligned address seems to cost 20 cycles
---   both with vector (VMOVDQU) and with integer (4 x MOVQ) moves.
+--   Confirm maximum input size before overflow. Empirically when
+--   checksumming an all-ones array I see the first overflow (checksum
+--   mismatch compared with reference) at 128KB. That would be more
+--   than sufficient for network packets.
 --
 -- This algorithm is a new formulation based on:
 --   Tony Rogvall's C intrinsics code (Snabb arch/avx2.c) and ideas.
@@ -65,8 +62,8 @@ function asm_checksum_avx2 (Dst)
       | vextracti128 xmm2, ymm1, 1   -- Separate remaining accumulators
       | vpaddd xmm1, xmm1, xmm2      -- Fold into 1xU32 accumulator
       | vmovd edx, xmm1              -- Extract U32 accumulator into integer reg
-      | movzx rax, dx                -- Get low 16 bits in dx
-      | shr edx, 16                  -- Get high 16 bits in ax
+      | movzx rax, dx                -- Get low 16 bits in rdx
+      | shr edx, 16                  -- Get high 16 bits in dx
       | add ax, dx                   -- Sum into U16 accumulator
       | adc ax, 0                    -- Add carry
       | xchg al, ah                  -- Swap to network byte order
@@ -82,30 +79,41 @@ cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2)
 
 _anchor = mcode
 
-dasm.dump(mcode, size)
+--dasm.dump(mcode, size)
 
 function selftest ()
+   print("selftest: checksum_simd")
    require("lib.checksum")
    local pmu = require("lib.pmu")
-   local sz = 32
+   local sz = 10*1024
    local s = ffi.new("char["..sz.."]")
    for i = 0, sz do
       s[i] = i % 256
    end
-   local n = 1e6
-   print("asm")
+   local n = 1e5
    -- Simple benchmark vs C intrinsics code.
    -- Tweak value of 'sz' to see different sizes.
    if pmu.is_available() then
+      print("ASM")
       pmu.profile(function () for i = 1, n do cksum(s, sz) end end,
             {}, {call=n, byte=sz*n, block=sz*n/32})
-      print("c")
+      print()
+      print("C")
       pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end,
             {}, {call=n, byte=sz*n, block=sz*n/32})
+   else
+      local _, reason = pmu.is_available()
+      print("Skipping benchmark. PMU not available: " .. reason)
+   end
+   -- Compare values for all input sizes
+   for i = 1, sz do
+      local v = cksum(s, sz-1)
+      local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0))
+      if v ~= r then
+         print(i, bit.tohex(v), bit.tohex(r))
+         error("checksum mismatch")
+      end
    end
-   local v = cksum(s, sz-1)
-   print("value", v, bit.tohex(v))
-   local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0))
-   print("ref", r, bit.tohex(r))
+   print("selftest: ok")
 end
 

From 999807df97181a72ba72055dfd8ee9554288d7c9 Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Sat, 30 Apr 2016 08:57:30 +0000
Subject: [PATCH 4/7] lib.checksum_simd: Add VZEROUPPER before RET

From the comment:

    This routine executes a VZEROUPPER instruction before returning in
    order to flush 256-bit AVX register state and avoid potential
    expensive SSE-AVX transition penalties. This is a cheap form of
    insurance against taking ~ 75 cycle penalties when mixing SSE and
    AVX code in the same program. See
    https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
    and particularly section 3.3.
---
 src/lib/checksum_simd.dasl | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl
index 3b8879e21e..ad57b6b6cd 100644
--- a/src/lib/checksum_simd.dasl
+++ b/src/lib/checksum_simd.dasl
@@ -27,6 +27,14 @@ local C = ffi.C
 -- This requires input pointers for which this is safe (e.g. 'struct
 -- packet').
 --
+-- This routine executes a VZEROUPPER instruction before returning in
+-- order to flush 256-bit AVX register state and avoid potential
+-- expensive SSE-AVX transition penalties. This is a cheap form of
+-- insurance against taking ~ 75 cycle penalties when mixing SSE and
+-- AVX code in the same program. See
+-- https://software.intel.com/en-us/articles/avoiding-avx-sse-transition-penalties
+-- and particularly section 3.3.
+--
 -- TODO:
 --   Confirm maximum input size before overflow. Empirically when
 --   checksumming an all-ones array I see the first overflow (checksum
@@ -67,6 +75,7 @@ function asm_checksum_avx2 (Dst)
       | add ax, dx                   -- Sum into U16 accumulator
       | adc ax, 0                    -- Add carry
       | xchg al, ah                  -- Swap to network byte order
+      | vzeroupper                   -- Avoid potential AVX-SSE transition penalty
       | ret
 end
 

From 9ea803140c16fa9b99b834cdd5cfbee8741f17df Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Sat, 30 Apr 2016 16:46:42 +0000
Subject: [PATCH 5/7] lib.checksum_simd: Add support for 'initial' argument

The assembler routine now has the same function interface as the C
functions and should be able to serve as a drop-in replacement.
---
 src/lib/checksum_simd.dasl | 28 ++++++++++++++++++----------
 1 file changed, 18 insertions(+), 10 deletions(-)

diff --git a/src/lib/checksum_simd.dasl b/src/lib/checksum_simd.dasl
index ad57b6b6cd..15a8dd1a47 100644
--- a/src/lib/checksum_simd.dasl
+++ b/src/lib/checksum_simd.dasl
@@ -47,9 +47,11 @@ local C = ffi.C
 --   Article http://locklessinc.com/articles/tcp_checksum/
 --   Article https://www.klittlepage.com/2013/12/10/accelerated-fix-processing-via-avx2-vector-instructions/ 
 
+-- Generate the function:
+--   uint16_t asm_checksum_avx2(char *ptr, int length, uint16_t initial)
 function asm_checksum_avx2 (Dst)
       |->checksum_avx2:
-      | add rsi, rdi                 -- rsi = data pointer, rdi = end address
+      | add rsi, rdi                 -- rsi = end address, rdi = data pointer
       | vpxor ymm0, ymm0, ymm0       -- ymm0 = 0 (useful constant zero)
       | vpxor ymm1, ymm1, ymm1       -- ymm1 = 0 (8 x u32 accumulators)
       | vmovdqu ymm6, [rsi]          -- Save the "trailer" 32-bytes
@@ -69,12 +71,15 @@ function asm_checksum_avx2 (Dst)
       | vphaddd ymm1, ymm1, ymm0     -- Fold from 4xU32 to 2xU32 accumulators
       | vextracti128 xmm2, ymm1, 1   -- Separate remaining accumulators
       | vpaddd xmm1, xmm1, xmm2      -- Fold into 1xU32 accumulator
-      | vmovd edx, xmm1              -- Extract U32 accumulator into integer reg
-      | movzx rax, dx                -- Get low 16 bits in rdx
-      | shr edx, 16                  -- Get high 16 bits in dx
-      | add ax, dx                   -- Sum into U16 accumulator
+      | vmovd ecx, xmm1              -- Extract U32 accumulator into integer reg
+      | movzx rax, cx                -- Get low 16 bits in rdx
+      | shr ecx, 16                  -- Get high 16 bits in dx
+      | add ax, cx                   -- Sum into U16 accumulator
       | adc ax, 0                    -- Add carry
       | xchg al, ah                  -- Swap to network byte order
+      | add ax, dx                   -- Add 'initial' argument (edx = arg 3)
+      | adc ax, 0                    -- Add carry
+      | xor ax, 0xffff               -- Convert to one's complement
       | vzeroupper                   -- Avoid potential AVX-SSE transition penalty
       | ret
 end
@@ -84,12 +89,13 @@ asm_checksum_avx2(Dst)
 local mcode, size = Dst:build()
 local entry = dasm.globals(globals, globalnames)
 
-cksum = ffi.cast("int(*)(void *, int)", entry.checksum_avx2)
+cksum = ffi.cast("uint16_t(*)(unsigned char *, size_t, uint16_t)", entry.checksum_avx2)
 
 _anchor = mcode
 
 --dasm.dump(mcode, size)
 
+-- See also 'snabbmark checksum' command
 function selftest ()
    print("selftest: checksum_simd")
    require("lib.checksum")
@@ -103,12 +109,13 @@ function selftest ()
    -- Simple benchmark vs C intrinsics code.
    -- Tweak value of 'sz' to see different sizes.
    if pmu.is_available() then
+      local rand = math.random(256*256)
       print("ASM")
-      pmu.profile(function () for i = 1, n do cksum(s, sz) end end,
+      pmu.profile(function () for i = 1, n do cksum(s, sz, rand) end end,
             {}, {call=n, byte=sz*n, block=sz*n/32})
       print()
       print("C")
-      pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, 0) end end,
+      pmu.profile(function () for i = 1, n do C.cksum_avx2(s, sz, rand) end end,
             {}, {call=n, byte=sz*n, block=sz*n/32})
    else
       local _, reason = pmu.is_available()
@@ -116,8 +123,9 @@ function selftest ()
    end
    -- Compare values for all input sizes
    for i = 1, sz do
-      local v = cksum(s, sz-1)
-      local r = bit.bxor(0xffff, require("lib.checksum").ipsum(s, sz-1, 0))
+      local rand = math.random(256*256)
+      local v = cksum(s, sz-1, rand)
+      local r = require("lib.checksum").ipsum(s, sz-1, rand)
       if v ~= r then
          print(i, bit.tohex(v), bit.tohex(r))
          error("checksum mismatch")

From 2674a1e26e99c8089343cf04975eeadaec890af8 Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Sat, 30 Apr 2016 16:55:55 +0000
Subject: [PATCH 6/7] snabbmark checksum: New benchmark

snabbmark can now measure the performance of built-in IP checksum
routines and presents results for apples-to-apples comparison.

The benchmark parameters are currently hard-coded. Length is randomly
chosen from a "log uniform" distribution (favoring smaller values but
drawn from a large range). Alignment is randomized.

The intention is to favor robust routines that are not sensitive to
alignment and predictable branches.

Currently the alignment is forced to be even. Initially this was to be
realistic for normal protocols but I discovered that odd addresses
actually crash the SSE implementation. Have to address that bug
separately.
---
 src/program/snabbmark/README        |  5 +++
 src/program/snabbmark/snabbmark.lua | 60 +++++++++++++++++++++++++++++
 2 files changed, 65 insertions(+)

diff --git a/src/program/snabbmark/README b/src/program/snabbmark/README
index 76b010e342..327e4c2abc 100644
--- a/src/program/snabbmark/README
+++ b/src/program/snabbmark/README
@@ -36,3 +36,8 @@ Usage:
 
     Example usage with 10 million packets, packet size 128 bytes:
     sudo SNABB_PCI0="0000:02:00.0"  SNABB_PCI1="0000:03:00.0" ./snabb snabbmark intel1g 10e6 128
+
+  snabbmark checksum
+    Compare performance of built-in IP checksum implementations.
+    (Requires CPU AVX2 support.)
+
diff --git a/src/program/snabbmark/snabbmark.lua b/src/program/snabbmark/snabbmark.lua
index 8a554ffa8b..990eb28487 100644
--- a/src/program/snabbmark/snabbmark.lua
+++ b/src/program/snabbmark/snabbmark.lua
@@ -22,6 +22,8 @@ function run (args)
       solarflare(unpack(args))
    elseif command == 'intel1g' and #args >= 2 and #args <= 3 then
       intel1g(unpack(args))
+   elseif command == 'checksum' then
+      checksum(args)
    else
       print(usage) 
       main.exit(1)
@@ -328,3 +330,61 @@ receive_device.interface= "rx1GE"
       main.exit(1)
    end
 end
+
+-- Checksum benchmark
+
+function checksum1 (size_min, size_max, verbose)
+   local loops = 1000
+   local inputs = 1000
+   local sizes  = {}
+   local arrays = {}
+   local bytes = 0
+   for i = 1, inputs do
+      -- Random sizes up to 10K from a "log uniform" distribution i.e.
+      -- proportionally more smaller values.
+      local size = size_min + math.floor(math.exp(math.log(size_max-size_min)*math.random()))
+      sizes[i] = size
+      bytes = bytes + size
+      -- Use even but otherwise random alignment.
+      -- XXX odd alignment breaks SSE2 checksum -- fix separately!
+      local align = math.random(32) * 2
+      arrays[i] = ffi.new("char[?]", size + align) + align
+      -- Fill with random data
+      for j = 0, size-1 do
+         arrays[i][j] = math.random(256)
+      end
+   end
+   local pmu = require("lib.pmu")
+ simd = require("lib.checksum_simd")
+   local checksum = require("lib.checksum")
+   local cksum = function (f)
+      return function ()
+         for i = 1, loops do
+            for i = 1, inputs do
+               f(arrays[i], sizes[i], 0)
+            end
+         end
+      end
+   end
+   local r = {}
+   local pmu_aux = {byte=bytes*loops, call=inputs*loops}
+   local pmu_events = {}
+   _, r.asm  = pmu.measure(cksum(simd.cksum),      pmu_events, pmu_aux)
+   _, r.avx2 = pmu.measure(cksum(C.cksum_avx2),    pmu_events, pmu_aux)
+   _, r.sse2 = pmu.measure(cksum(C.cksum_sse2),    pmu_events, pmu_aux)
+   _, r.base  = pmu.measure(cksum(C.cksum_generic), pmu_events, pmu_aux)
+   print(("%-14s %14s %14s %14s"):format("VARIANT", "BYTES/PACKET", "BYTES/CYCLE", "CYCLES/PACKET"))
+   local totalbytes = bytes * loops
+   for variant, result in pairs(r) do
+      local bpp = bytes / inputs
+      local bpc = totalbytes / result.cycles
+      local cpp = result.cycles / (inputs * loops)
+      print(("%-14s %14.3f %14.3f %14.3f"):format(variant, bpp, bpc, cpp))
+      if verbose then pmu.report(result, pmu_aux) print() end
+   end
+end
+
+function checksum (args)
+   -- XXX add a useful command-line syntax
+   checksum1(20, 5000, false)
+end

From 3c1bb619bb7f5fb3dea7602a3900be739d321cfb Mon Sep 17 00:00:00 2001
From: Luke Gorrie <luke@snabb.co>
Date: Sun, 1 May 2016 07:24:16 +0000
Subject: [PATCH 7/7] lib.checksum: Rework C implementations

Before there were three separate C checksum implementations (generic,
SSE, AVX) that are each compiled with different compiler settings. These
were fairly complex due to SIMD intrinsics. The SSE implementation was
also incorrect and would segfault with odd-numbered addresses.

Now there is one C checksum routine that is compiled with two different
compiler settings (default/SSE and AVX). The checksum routine is written
in a very simple style that GCC successfully vectorizes
automatically (tested with GCC 4.8.5, 4.9.3, 5.3.0).

I experimented with "waving a voodoo chicken" in a few different
ways (# accumulators = 1, 2, 4; accumulator size = 32bit, 64bit). This
formulation seems to work best for GCC. This does feel like hokus-pokus
that exposes us to GCC behavior that is not nailed down, but that
bothers me less than the high-brow intrinsics code.

I have retained the AVX2 assembler implementation with DynASM because I
have not been able to beat that with GCC yet.

Current scoreboard:

    VARIANT          BYTES/PACKET    BYTES/CYCLE  CYCLES/PACKET
    base                  631.331          2.939        214.796
    asm                   631.331          4.161        151.719
    avx2                  631.331          3.416        184.825
---
 src/Makefile                             | 13 ++--
 src/arch/avx2.c                          | 84 ---------------------
 src/arch/checksum.c                      | 42 +++++++++++
 src/arch/sse2.c                          | 94 ------------------------
 src/lib/checksum.h                       |  9 +--
 src/lib/checksum.lua                     | 17 +----
 src/lib/{checksum.c => checksum_extra.c} | 66 +----------------
 src/program/snabbmark/snabbmark.lua      |  5 +-
 8 files changed, 60 insertions(+), 270 deletions(-)
 delete mode 100644 src/arch/avx2.c
 create mode 100644 src/arch/checksum.c
 delete mode 100644 src/arch/sse2.c
 rename src/lib/{checksum.c => checksum_extra.c} (65%)

diff --git a/src/Makefile b/src/Makefile
index 831d6fc60c..0d405a23f5 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -15,7 +15,6 @@ PFLUASRC = $(shell cd ../lib/pflua/src && \
 CSRC   = $(shell find . -regex '[^\#]*\.c' -not -regex './arch/.*' -printf '%P ')
 CHDR   = $(shell find . -regex '[^\#]*\.h' -printf '%P ')
 ASM    = $(shell find . -regex '[^\#]*\.dasl' -printf '%P ')
-ARCHSRC= $(shell find . -regex '^./arch/[^\#]*\.c' -printf '%P ')
 RMSRC  = $(shell find . -name '*.md' -not -regex './obj.*' -printf '%P ')
 # regexp is to include program/foo but not program/foo/bar
 PROGRAM = $(shell find program -regex '^[^/]+/[^/]+' -type d -printf '%P ')
@@ -26,7 +25,7 @@ LUAOBJ := $(patsubst %.lua,obj/%_lua.o,$(LUASRC))
 PFLUAOBJ := $(patsubst %.lua,obj/%_lua.o,$(PFLUASRC))
 COBJ   := $(patsubst %.c,obj/%_c.o,    $(CSRC))
 HOBJ   := $(patsubst %.h,obj/%_h.o,    $(CHDR))
-ARCHOBJ:= $(patsubst %.c,obj/%_c.o,    $(ARCHSRC))
+ARCHOBJ:= obj/arch/checksum_c.o obj/arch/checksum_avx2_c.o
 ASMOBJ := $(patsubst %.dasl,obj/%_dasl.o,   $(ASM))
 JITOBJS:= $(patsubst %,obj/jit_%.o,$(JITSRC))
 EXTRAOBJS := obj/jit_tprof.o obj/jit_vmprof.o obj/strict.o
@@ -126,13 +125,13 @@ $(COBJ): obj/%_c.o: %.c $(CHDR) Makefile | $(OBJDIR)
 	$(E) "C         $@"
 	$(Q) gcc $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $<
 
-obj/arch/avx2_c.o: arch/avx2.c Makefile
+obj/arch/checksum_avx2_c.o: arch/checksum.c Makefile
 	$(E) "C(AVX2)   $@"
-	$(Q) gcc -O2 -mavx2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $<
+	$(Q) gcc -Dcksum=cksum_avx2 -O3 -mavx2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $<
 
-obj/arch/sse2_c.o: arch/sse2.c Makefile
-	$(E) "C(SSE2)   $@"
-	$(Q) gcc -O2 -msse2 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $<
+obj/arch/checksum_c.o: arch/checksum.c Makefile
+	$(E) "C(SSE4)   $@"
+	$(Q) gcc -msse4 -O3 $(DEBUG) -Wl,-E -I ../lib/luajit/src -I . -include $(CURDIR)/../gcc-preinclude.h -c -Wall -Werror -o $@ $<
 
 $(HOBJ): obj/%_h.o: %.h Makefile | $(OBJDIR)
 	$(E) "H         $@"
diff --git a/src/arch/avx2.c b/src/arch/avx2.c
deleted file mode 100644
index fce7e78a0e..0000000000
--- a/src/arch/avx2.c
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Use of this source code is governed by the Apache 2.0 license; see COPYING.
- * Based on original SSE2 code by Tony Rogvall that is
- * copyright 2011 Teclo Networks AG. MIT licensed by Juho Snellman. */
-
-/* IP checksum routine for AVX2. */
-
-#include <stdint.h>
-#include <arpa/inet.h>
-#include <x86intrin.h>
-#include "lib/checksum.h"
-#include "lib/checksum_lib.h"
-
-static inline uint32_t cksum_avx2_loop(unsigned char *p, size_t n)
-{
-    __m256i sum0, sum1, zero;
-    uint32_t s[8] __attribute__((aligned(32))); // aligned for avx2 store
-    uint32_t sum2;
-
-    zero = _mm256_set_epi64x(0,0,0,0);
-    sum0 = zero;
-    sum1 = zero;
-
-    while(n) {
-        size_t k = (n >= 0xff) ? 0xff : n;
-        __m256i t0,t1;
-        __m256i s0 = zero;
-        __m256i s1 = zero;
-        n -= k;
-        while (k) {
-            __m256i src = _mm256_loadu_si256((__m256i const*) p);
-            __m256i t;
-
-            t = _mm256_unpacklo_epi8(src, zero);
-            s0 = _mm256_adds_epu16(s0, t);
-            t = _mm256_unpackhi_epi8(src, zero);
-            s1 = _mm256_adds_epu16(s1, t);
-            p += sizeof(src);
-            k--;
-        }
-
-        // LOW - combine S0 and S1 into sum0
-        t0 = _mm256_unpacklo_epi16(s0, zero);
-        sum0 = _mm256_add_epi32(sum0, t0);
-        t1 = _mm256_unpacklo_epi16(s1, zero);
-        sum1 = _mm256_add_epi32(sum1, t1);
-
-        // HIGH - combine S0 and S1 into sum1
-        t0 = _mm256_unpackhi_epi16(s0, zero);
-        sum0 = _mm256_add_epi32(sum0, t0);
-        t1 = _mm256_unpackhi_epi16(s1, zero);
-        sum1 = _mm256_add_epi32(sum1, t1);
-    }
-    // here we must sum the 4-32 bit sums into one 32 bit sum
-    _mm256_store_si256((__m256i*)s, sum0);
-    sum2 = (s[0]<<8) + s[1] + (s[2]<<8) + s[3] + (s[4]<<8) + s[5] + (s[6]<<8) + s[7];
-    _mm256_store_si256((__m256i*)s, sum1);
-    sum2 += (s[0]<<8) + s[1] + (s[2]<<8) + s[3] + (s[4]<<8) + s[5] + (s[6]<<8) + s[7];
-
-    return sum2;
-}
-
-uint16_t cksum_avx2(unsigned char *p, size_t n, uint16_t initial)
-{
-    uint32_t sum = initial;
-
-    if (n < 128) { return cksum_generic(p, n, initial); }
-    if (n >= 64) {
-        size_t k = (n >> 5);
-        sum += cksum_avx2_loop(p, k);
-        n -= (32*k);
-        p += (32*k);
-    }
-    if (n > 1) {
-        size_t k = (n>>1);   // number of 16-bit words
-        sum += cksum_ua_loop(p, k);
-        n -= (2*k);
-        p += (2*k);
-    }
-    if (n)       // take care of left over byte
-        sum += (p[0] << 8);
-    while(sum>>16)
-        sum = (sum & 0xFFFF) + (sum>>16);
-    return (uint16_t)~sum;
-}
diff --git a/src/arch/checksum.c b/src/arch/checksum.c
new file mode 100644
index 0000000000..458c66efd1
--- /dev/null
+++ b/src/arch/checksum.c
@@ -0,0 +1,42 @@
+/* Use of this source code is governed by the Apache 2.0 license; see COPYING. */
+/* IP checksum routines. */
+
+#include <arpa/inet.h>
+#include <stdio.h>
+#include <stddef.h>
+#include <string.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <sys/time.h>
+
+uint16_t cksum(unsigned char *p, size_t len, uint16_t initial)
+{
+  uint64_t sum = htons(initial);
+  uint64_t sum1 = 0;
+  const uint32_t *u32 = (const uint32_t *)p;
+
+  while (len >= (sizeof(*u32) * 2)) {
+    sum  += u32[0];
+    sum1 += u32[1];
+    u32 += 2;
+    len -= sizeof(*u32) * 2;
+  }
+  sum += sum1;
+
+  const uint16_t *u16 = (const uint16_t *)u32;
+  while (len >= sizeof(*u16)) {
+    sum += *u16;
+    len -= sizeof(*u16);
+    u16 += 1;
+  }
+
+  /* if length is in odd bytes */
+  if (len == 1)
+    sum += *((const uint8_t *)u16);
+
+  while(sum>>16)
+    sum = (sum & 0xFFFF) + (sum>>16);
+  return ntohs((uint16_t)~sum);
+}
+
+
diff --git a/src/arch/sse2.c b/src/arch/sse2.c
deleted file mode 100644
index f98878e871..0000000000
--- a/src/arch/sse2.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/* Use of this source code is governed by the Apache 2.0 license; see COPYING.
- * Original code by Tony Rogvall that is
- * copyright 2011 Teclo Networks AG. MIT licensed by Juho Snellman. */
-
-/* IP checksum routine for SSE2. */
-
-#include <stdint.h>
-#include <arpa/inet.h>
-#include <x86intrin.h>
-#include "lib/checksum.h"
-#include "lib/checksum_lib.h"
-
-//
-// this loop may only run when data is aligned 16 byte aligned
-// n is number of 16 byte vectors
-//
-static inline uint32_t cksum_sse2_loop(unsigned char *p, size_t n)
-{
-  __m128i sum0, sum1, zero;
-  uint32_t s[4];
-  uint32_t sum2;
-
-  zero = _mm_set_epi32(0,0,0,0);
-  sum0 = zero;
-  sum1 = zero;
-
-  while(n) {
-    size_t k = (n >= 0xff) ? 0xff : n;
-    __m128i t0,t1;
-    __m128i s0 = zero;
-    __m128i s1 = zero;
-    n -= k;
-    while (k) {
-      __m128i src = _mm_load_si128((__m128i const*) p);
-      __m128i t;
-
-      t = _mm_unpacklo_epi8(src, zero);
-      s0 = _mm_adds_epu16(s0, t);
-      t = _mm_unpackhi_epi8(src, zero);
-      s1 = _mm_adds_epu16(s1, t);
-      p += sizeof(src);
-      k--;
-    }
-
-    // LOW - combine S0 and S1 into sum0
-    t0 = _mm_unpacklo_epi16(s0, zero);
-    sum0 = _mm_add_epi32(sum0, t0);
-    t1 = _mm_unpacklo_epi16(s1, zero);
-    sum1 = _mm_add_epi32(sum1, t1);
-
-    // HIGH - combine S0 and S1 into sum1
-    t0 = _mm_unpackhi_epi16(s0, zero);
-    sum0 = _mm_add_epi32(sum0, t0);
-    t1 = _mm_unpackhi_epi16(s1, zero);
-    sum1 = _mm_add_epi32(sum1, t1);
-  }
-  // here we must sum the 4-32 bit sums into one 32 bit sum
-  _mm_store_si128((__m128i*)s, sum0);
-  sum2 = (s[0]<<8) + s[1] + (s[2]<<8) + s[3];
-  _mm_store_si128((__m128i*)s, sum1);
-  sum2 += (s[0]<<8) + s[1] + (s[2]<<8) + s[3];
-  return sum2;
-}
-
-uint16_t cksum_sse2(unsigned char *p, size_t n, uint16_t initial)
-{
-  uint32_t sum = initial;
-
-  if (n < 128) { return cksum_generic(p, n, initial); }
-  int unaligned = (unsigned long) p & 0xf;
-  if (unaligned) {
-    size_t k = (0x10 - unaligned) >> 1;
-    sum += cksum_ua_loop(p, k);
-    n -= (2*k);
-    p += (2*k);
-  }
-  if (n >= 32) {  // fast even with only two vectors
-    size_t k = (n >> 4);
-    sum += cksum_sse2_loop(p, k);
-    n -= (16*k);
-    p += (16*k);
-  }
-  if (n > 1) {
-    size_t k = (n>>1);   // number of 16-bit words
-    sum += cksum_ua_loop(p, k);
-    n -= (2*k);
-    p += (2*k);
-  }
-  if (n)       // take care of left over byte
-    sum += (p[0] << 8);
-  while(sum>>16)
-    sum = (sum & 0xFFFF) + (sum>>16);
-  return (uint16_t)~sum;
-}
diff --git a/src/lib/checksum.h b/src/lib/checksum.h
index 008bfa190e..2081d84459 100644
--- a/src/lib/checksum.h
+++ b/src/lib/checksum.h
@@ -1,17 +1,12 @@
 /* Use of this source code is governed by the Apache 2.0 license; see COPYING. */
 
-// Calculate IP checksum using SSE2 instructions.
-// (This will crash if you call it on a CPU that does not support SSE.)
-uint16_t cksum_sse2(unsigned char *p, size_t n, uint16_t initial);
+// Calculate IP checksum.
+uint16_t cksum(unsigned char *p, size_t n, uint16_t initial);
 
 // Calculate IP checksum using AVX2 instructions.
 // (This will crash if you call it on a CPU that does not support AVX2.)
 uint16_t cksum_avx2(unsigned char *p, size_t n, uint16_t initial);
 
-// Calculate IP checksum using portable C code.
-// This works on all hardware.
-uint16_t cksum_generic(unsigned char *p, size_t n, uint16_t initial);
-
 // Incrementally update checksum when modifying a 16-bit value.
 void checksum_update_incremental_16(uint16_t* checksum_cell,
                                     uint16_t* value_cell,
diff --git a/src/lib/checksum.lua b/src/lib/checksum.lua
index 05ebc07849..de3ae151a6 100644
--- a/src/lib/checksum.lua
+++ b/src/lib/checksum.lua
@@ -15,12 +15,8 @@ local band, lshift = bit.band, bit.lshift
 local cpuinfo = lib.readfile("/proc/cpuinfo", "*a")
 assert(cpuinfo, "failed to read /proc/cpuinfo for hardware check")
 local have_avx2 = cpuinfo:match("avx2")
-local have_sse2 = cpuinfo:match("sse2")
-
-if     have_avx2 then ipsum = C.cksum_avx2
-elseif have_sse2 then ipsum = C.cksum_sse2
-else                  ipsum = C.cksum_generic end
 
+if have_avx2 then ipsum = C.cksum_avx2 else ipsum = C.cksum end
 
 function finish_packet (buf, len, offset)
    ffi.cast('uint16_t *', buf+offset)[0] = lib.htons(ipsum(buf, len, 0))
@@ -102,27 +98,22 @@ end
 
 function selftest ()
    print("selftest: checksum")
-   local tests = 1000
+   local tests = 10000
    local n = 1000000
    local array = ffi.new("char[?]", n)
-   for i = 0, n-1 do  array[i] = i  end
+   for i = 0, n-1 do  array[i] = math.random(256)  end
    local avx2ok, sse2ok = 0, 0
    for i = 1, tests do
       local initial = math.random(0, 0xFFFF)
-      local ref =   C.cksum_generic(array+i*2, i*10+i, initial)
+      local ref =   C.cksum(array+i*2, i*10+i, initial)
       if have_avx2 and C.cksum_avx2(array+i*2, i*10+i, initial) == ref then
          avx2ok = avx2ok + 1
       end
-      if have_sse2 and C.cksum_sse2(array+i*2, i*10+i, initial) == ref then
-         sse2ok = sse2ok + 1
-      end
       assert(ipsum(array+i*2, i*10+i, initial) == ref, "API function check")
    end
    if have_avx2 then print("avx2: "..avx2ok.."/"..tests) else print("no avx2") end
-   if have_sse2 then print("sse2: "..sse2ok.."/"..tests) else print("no sse2") end
    selftest_ipv4_tcp()
    assert(not have_avx2 or avx2ok == tests, "AVX2 test failed")
-   assert(not have_sse2 or sse2ok == tests, "SSE2 test failed")
    print("selftest: ok")
 end
 
diff --git a/src/lib/checksum.c b/src/lib/checksum_extra.c
similarity index 65%
rename from src/lib/checksum.c
rename to src/lib/checksum_extra.c
index b75d1b40e0..f6e8e5c910 100644
--- a/src/lib/checksum.c
+++ b/src/lib/checksum_extra.c
@@ -1,67 +1,9 @@
-/* Use of this source code is governed by the Apache 2.0 license; see COPYING.
- * Generic checksm routine originally taken from DPDK: 
- *   BSD license; (C) Intel 2010-2015, 6WIND 2014. */
-
-/* IP checksum routines.
- *
- * See src/arch/ for architecture specific SIMD versions. */
+/* Use of this source code is governed by the Apache 2.0 license; see COPYING. */
+/* IP checksum routines. */
 
 #include <arpa/inet.h>
-#include <stdio.h>
-#include <stddef.h>
-#include <string.h>
-#include <stdlib.h>
 #include <stdint.h>
-#include <sys/time.h>
-
-uint16_t cksum_generic(unsigned char *p, size_t len, uint16_t initial)
-{
-  uint32_t sum = htons(initial);
-  const uint16_t *u16 = (const uint16_t *)p;
-
-  while (len >= (sizeof(*u16) * 4)) {
-    sum += u16[0];
-    sum += u16[1];
-    sum += u16[2];
-    sum += u16[3];
-    len -= sizeof(*u16) * 4;
-    u16 += 4;
-  }
-  while (len >= sizeof(*u16)) {
-    sum += *u16;
-    len -= sizeof(*u16);
-    u16 += 1;
-  }
-
-  /* if length is in odd bytes */
-  if (len == 1)
-    sum += *((const uint8_t *)u16);
-
-  while(sum>>16)
-    sum = (sum & 0xFFFF) + (sum>>16);
-  return ntohs((uint16_t)~sum);
-}
-
-// SIMD versions
-
-//
-// A unaligned version of the cksum,
-// n is number of 16-bit values to sum over, n in it self is a
-// 16 bit number in order to avoid overflow in the loop
-//
-static inline uint32_t cksum_ua_loop(unsigned char *p, uint16_t n)
-{
-  uint32_t s0 = 0;
-  uint32_t s1 = 0;
-
-  while (n) {
-    s0 += p[0];
-    s1 += p[1];
-    p += 2;
-    n--;
-  }
-  return (s0<<8)+s1;
-}
+#include "checksum.h"
 
 // Incrementally update checksum when modifying a 16-bit value.
 void checksum_update_incremental_16(uint16_t* checksum_cell,
@@ -125,7 +67,7 @@ uint32_t pseudo_header_initial(const int8_t *buf, size_t len)
     uint32_t sum = 0;
     len -= headersize;
     if (ipv == 4) {                         // IPv4
-      if (cksum_generic((unsigned char *)buf, headersize, 0) != 0) {
+      if (cksum((unsigned char *)buf, headersize, 0) != 0) {
         return 0xFFFF0002;
       }
       sum = htons(len & 0x0000FFFF) + (proto << 8)
diff --git a/src/program/snabbmark/snabbmark.lua b/src/program/snabbmark/snabbmark.lua
index 990eb28487..77885478f2 100644
--- a/src/program/snabbmark/snabbmark.lua
+++ b/src/program/snabbmark/snabbmark.lua
@@ -334,7 +334,7 @@ end
 -- Checksum benchmark
 
 function checksum1 (size_min, size_max, verbose)
-   local loops = 1000
+   local loops = 10000
    local inputs = 1000
    local sizes  = {}
    local arrays = {}
@@ -371,8 +371,7 @@ function checksum1 (size_min, size_max, verbose)
    local pmu_events = {}
    _, r.asm  = pmu.measure(cksum(simd.cksum),      pmu_events, pmu_aux)
    _, r.avx2 = pmu.measure(cksum(C.cksum_avx2),    pmu_events, pmu_aux)
-   _, r.sse2 = pmu.measure(cksum(C.cksum_sse2),    pmu_events, pmu_aux)
-   _, r.base  = pmu.measure(cksum(C.cksum_generic), pmu_events, pmu_aux)
+   _, r.base  = pmu.measure(cksum(C.cksum), pmu_events, pmu_aux)
    print(("%-14s %14s %14s %14s"):format("VARIANT", "BYTES/PACKET", "BYTES/CYCLE", "CYCLES/PACKET"))
    local totalbytes = bytes * loops
    for variant, result in pairs(r) do