From 9333b379aa4309ad91255f5ff7cfd40fbdae06ed Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 17 Jan 2016 06:11:42 +0000 Subject: [PATCH 1/4] lib.blit: Added "blitter" module This is a simple placeholder implementation for an optimized bit-blitting API. --- src/lib/blit.lua | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) create mode 100644 src/lib/blit.lua diff --git a/src/lib/blit.lua b/src/lib/blit.lua new file mode 100644 index 0000000000..ce4a31ff1f --- /dev/null +++ b/src/lib/blit.lua @@ -0,0 +1,33 @@ +-- blit.lua - offload engine for memory operations + +module(..., package.seeall) + +local ffi = require("ffi") + +-- The blit module provides "blitter" operation to offload +-- performance-critical memory operations. The API allows scheduling a +-- series of operations, that can be performed at any time and in any +-- order, and then executing a "barrier" to wait for completion. + +-- The implementation in this file is very basic but could be extended +-- in the future to take advantage of the flexibility afforded by the +-- API to perform special optimizations (for example parallel memory +-- copies to amortize cache latency, etc). + +function copy (dst, src, len) + -- Trivial implementation: simply do an immediate memory copy. + ffi.copy(dst, src, len) +end + +-- Wait until all copies have completed. +function barrier () + -- No-op because the copies were already executed eagerly. +end + +function selftest () + print("selftest: blit") + -- It would be valuable to have an extensive selftest function to + -- make it easy to develop and test new optimized blitter + -- implementations. + print("selftest: ok") +end From b93121383ab8444edb995e26e4a0c8b8ad25488f Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Sun, 17 Jan 2016 06:12:56 +0000 Subject: [PATCH 2/4] (WIP) virtio net_device: Perform VM copies with lib.blit Update the vhost-user code to perform Snabb<->VM memory copies via the lib.blit module. 
This allows experimental optimizations with local changes to the blit module. This essentially separates "virtio vring processing" and "virtio memory copies" into being two separate problems that can be profiled and optimized separately. This is work-in-progress: Care must be taken not to let the guest see that packets are available until the blit.barrier() operation has been executed and I think this will require moving the ring index updates. --- src/lib/virtio/net_device.lua | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lib/virtio/net_device.lua b/src/lib/virtio/net_device.lua index 58b040be30..4cf0381989 100644 --- a/src/lib/virtio/net_device.lua +++ b/src/lib/virtio/net_device.lua @@ -11,6 +11,7 @@ local packet = require("core.packet") local timer = require("core.timer") local vq = require("lib.virtio.virtq") local checksum = require("lib.checksum") +local blit = require("lib.blit") local ffi = require("ffi") local C = ffi.C local band = bit.band @@ -106,6 +107,7 @@ end function VirtioNetDevice:poll_vring_receive () -- RX self:receive_packets_from_vm() + blit.barrier() self:rx_signal_used() end @@ -139,7 +141,8 @@ function VirtioNetDevice:rx_buffer_add(rx_p, addr, len) local addr = self:map_from_guest(addr) local pointer = ffi.cast(char_ptr_t, addr) - packet.append(rx_p, pointer, len) + blit.copy(rx_p.data + rx_p.length, pointer, len) + rx_p.length = rx_p.length + len return len end @@ -174,6 +177,7 @@ end function VirtioNetDevice:poll_vring_transmit () -- RX self:transmit_packets_to_vm() + blit.barrier() self:tx_signal_used() end @@ -239,7 +243,7 @@ function VirtioNetDevice:tx_buffer_add(tx_p, addr, len) local pointer = ffi.cast(char_ptr_t, addr) assert(tx_p.length <= len) - ffi.copy(pointer, tx_p.data, tx_p.length) + blit.copy(pointer, tx_p.data, tx_p.length) return tx_p.length end @@ -288,7 +292,7 @@ function VirtioNetDevice:tx_buffer_add_mrg_rxbuf(tx_p, addr, len) local to_copy = math.min(tx_p.length - self.tx.data_sent, len + adjust) -- 
copy the data to the adjusted pointer - ffi.copy(pointer - adjust, tx_p.data + self.tx.data_sent, to_copy) + blit.copy(pointer - adjust, tx_p.data + self.tx.data_sent, to_copy) -- update the num_buffers in the first virtio header self.tx.tx_mrg_hdr[0].num_buffers = self.tx.tx_mrg_hdr[0].num_buffers + 1 From d0e919489dd9b6cae0520e2f3a13c6efa8d244a7 Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Mon, 18 Jan 2016 23:54:52 +0000 Subject: [PATCH 3/4] lib.blit: Draft assembler code version work in progress / not complete: - Rounds all copies up to 32-bytes. - Fails NFV benchmark test. --- src/lib/blit.dasl | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 src/lib/blit.dasl diff --git a/src/lib/blit.dasl b/src/lib/blit.dasl new file mode 100644 index 0000000000..e74846dfcc --- /dev/null +++ b/src/lib/blit.dasl @@ -0,0 +1,132 @@ +-- blit.lua - offload engine for memory operations + +module(..., package.seeall) + +local dasm = require("dasm") +local ffi = require("ffi") + +|.arch x64 +|.actionlist actions + +-- The blit module provides "blitter" operation to offload +-- performance-critical memory operations. The API allows scheduling a +-- series of operations, that can be performed at any time and in any +-- order, and then executing a "barrier" to wait for completion. + +ffi.cdef[[ + struct blit_queue_entry { + void *src; + void *dst; + uint64_t len; + }; +]] + +-- Queue of memory operations +local maxqueue = 10000 +local queue = ffi.new("struct blit_queue_entry[?]", maxqueue+2) +local nqueued = 0 + +function copy (dst, src, len) + assert(len%32==0) + if nqueued == maxqueue then barrier() end + queue[nqueued].src = src + queue[nqueued].dst = dst + queue[nqueued].len = len + nqueued = nqueued + 1 + -- Sentinel + queue[nqueued].src = nil +end + +-- Assembler code for the barrier operation +function gen_barrier (Dst) + | ret + | mov64 r8, queue + + |->queue: + | cmp qword [r8], 0 -- sentinel? 
+ | je >9 + + -- Load parameters for the next copy + | mov rsi, [r8] + | mov rdi, [r8+8] + | mov rax, [r8+16] + + -- Copy 32 bytes at a time + | xor rcx, rcx + |->copy: + | vmovdqu ymm0, [rsi+rcx] + | vmovdqu [rdi+rcx], ymm0 + | add rcx, 32 + | cmp rcx, rax + | jl ->copy + + -- Advance to the next copy in the queue + | add r8, ffi.sizeof('struct blit_queue_entry') + | jmp ->queue + |9: + | ret +end + +-- XXX the code below is copy-paste and should be reused somehow. +local debug = false +local anchor = {} +-- Utility: assemble code and optionally dump disassembly. +local function assemble (name, prototype, generator) + local Dst = dasm.new(actions) + generator(Dst) + local mcode, size = Dst:build() + table.insert(anchor, mcode) + if debug then + print("mcode dump: "..name) + dasm.dump(mcode, size) + end + return ffi.cast(prototype, mcode) +end + +-- Machine code for the barrier function +local asm_barrier = assemble("barrier", "void(*)()", gen_barrier) + +-- Wait until all copies have completed. +function barrier () + asm_barrier() + nqueued = 0 + queue[0].src = nil -- sentinel +end + +-- Test by doing the same random copies with blit and ffi.copy() and +-- comparing the results. 
+function selftest () + print("selftest: blit") + local membytes = 10240 + local memx = ffi.new("char[?]", membytes) + local memy = ffi.new("char[?]", membytes) + for i = 0, 10 do + print("loop "..i) + -- Initialize memx and memy with identical randomly chosen values + for i = 0, membytes-1 do + local n = math.random(256) + memx[i] = n + memy[i] = n + end + -- Perform some random copies + for i = 0, math.random(1000) do + local srcoffset = math.random(1000) + local dstoffset = math.random(1000) + local length = math.random(8) * 32 + 32 + copy (memx+dstoffset, memx+srcoffset+5120, length) + ffi.copy(memy+dstoffset, memy+srcoffset+5120, length) + end + -- Execute deferred copies + barrier() + -- Check for same contents + for i = 0, membytes-1 do + if memx[i] ~= memy[i] then + print(require("core.lib").hexdump(ffi.string(memx+i, 32))) + print(require("core.lib").hexdump(ffi.string(memy+i, 32))) + error("mismatch at byte " .. i) + end + end + end + print("selftest: ok") +end + From a3c65f08e73f0e376d15cda965d91a71b5e6802a Mon Sep 17 00:00:00 2001 From: Luke Gorrie Date: Tue, 19 Jan 2016 13:21:56 +0000 Subject: [PATCH 4/4] lib.blit: Added blitter implementation in assembler [wip] The lib.blit API is now implemented by an assembler routine that batches copies together. This is a work in progress due to one major restriction: copy length has to be a multiple of 32 bytes. --- src/lib/blit.dasl | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/src/lib/blit.dasl b/src/lib/blit.dasl index e74846dfcc..e36bbb652c 100644 --- a/src/lib/blit.dasl +++ b/src/lib/blit.dasl @@ -13,6 +13,10 @@ local ffi = require("ffi") -- series of operations, that can be performed at any time and in any -- order, and then executing a "barrier" to wait for completion. +-- This module implements a blitter that defers all copy operations +-- until the barrier() is invoked and then executes them with a single +-- assembler code routine. 
+ ffi.cdef[[ struct blit_queue_entry { void *src; @@ -27,7 +31,8 @@ local queue = ffi.new("struct blit_queue_entry[?]", maxqueue+2) local nqueued = 0 function copy (dst, src, len) - assert(len%32==0) + -- XXX This routine is hard-coded for multiples of 32 bytes. + assert(len%32 == 0) if nqueued == maxqueue then barrier() end queue[nqueued].src = src queue[nqueued].dst = dst @@ -39,7 +44,6 @@ end -- Assembler code for the barrier operation function gen_barrier (Dst) - | ret | mov64 r8, queue |->queue: @@ -47,17 +51,17 @@ function gen_barrier (Dst) | je >9 -- Load parameters for the next copy - | mov rsi, [r8] - | mov rdi, [r8+8] - | mov rax, [r8+16] + | mov rsi, [r8] -- source + | mov rdi, [r8+8] -- destination + | mov rcx, [r8+16] -- length -- Copy 32 bytes at a time - | xor rcx, rcx + | xor rax, rax |->copy: - | vmovdqu ymm0, [rsi+rcx] - | vmovdqu [rdi+rcx], ymm0 - | add rcx, 32 - | cmp rcx, rax + | vmovdqu ymm0, [rsi+rax] + | vmovdqu [rdi+rax], ymm0 + | add rax, 32 + | cmp rax, rcx | jl ->copy -- Advance to the next copy in the queue @@ -69,7 +73,7 @@ end -- XXX the code below is copy-paste and should be reused somehow. local debug = false -local anchor = {} +anchor = {} -- Utility: assemble code and optionally dump disassembly. local function assemble (name, prototype, generator) local Dst = dasm.new(actions) @@ -130,3 +134,5 @@ function selftest () print("selftest: ok") end +function copy (dst, src, len) ffi.copy(dst, src, len) end +function barrier () end