-
Notifications
You must be signed in to change notification settings - Fork 299
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Optimized "blitter" routine written in assembler [wip] #719
Open
lukego
wants to merge
4
commits into
snabbco:master
Choose a base branch
from
lukego:blit-asm
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
Show all changes
4 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,138 @@ | ||
-- blit.lua - offload engine for memory operations | ||
|
||
module(..., package.seeall) | ||
|
||
local dasm = require("dasm") | ||
local ffi = require("ffi") | ||
|
||
|.arch x64 | ||
|.actionlist actions | ||
|
||
-- The blit module provides "blitter" operation to offload | ||
-- performance-critical memory operations. The API allows scheduling a | ||
-- series of operations, that can be performed at any time and in any | ||
-- order, and then executing a "barrier" to wait for completion. | ||
|
||
-- This module implements a blitter that defers all copy operations | ||
-- until the barrier() is invoked and then executes them with a single | ||
-- assembler code routine. | ||
|
||
-- One deferred copy. The assembler routine reads these fields at byte
-- offsets 0 (src), 8 (dst) and 16 (len), so the field order and sizes
-- must not change without updating gen_barrier().
ffi.cdef[[
   struct blit_queue_entry {
      void *src;
      void *dst;
      uint64_t len;
   };
]]

-- Queue of memory operations
local maxqueue = 10000
-- Allocated larger than maxqueue: copy() writes a sentinel entry one
-- past the last queued operation, so index maxqueue must be valid.
local queue = ffi.new("struct blit_queue_entry[?]", maxqueue+2)
-- Number of operations currently waiting in the queue.
local nqueued = 0
|
||
-- Schedule a copy of len bytes from src to dst.
--
-- The operation is only recorded here; the bytes are moved when
-- barrier() runs, either on request or because the queue is full.
--
-- dst: pointer to the destination memory.
-- src: pointer to the source memory. Must not be NULL, because a NULL
--      src is what marks the end of the queue.
-- len: number of bytes to copy; must be a multiple of 32.
function copy (dst, src, len)
   -- XXX This routine is hard-coded for multiples of 32 bytes.
   assert(len%32 == 0)
   -- An empty copy must not be queued: the assembler loop moves 32
   -- bytes before it tests the length, so it would write past dst.
   if len == 0 then return end
   -- Flush first when the queue is full so the new entry always fits.
   if nqueued == maxqueue then barrier() end
   queue[nqueued].src = src
   queue[nqueued].dst = dst
   queue[nqueued].len = len
   nqueued = nqueued + 1
   -- Sentinel
   queue[nqueued].src = nil
end
|
||
-- Assembler code for the barrier operation.
--
-- Emits a routine that walks the queue from its first entry and
-- performs each copy in turn, stopping at the first entry whose src
-- field is zero (the sentinel). The address of the queue is embedded
-- in the generated code as an immediate.
--
-- Register use: r8 = current queue entry, rsi = source,
-- rdi = destination, rcx = length, rax = offset into the current copy.
--
-- Dst: the DynASM state to emit into.
function gen_barrier (Dst)
   | mov64 r8, queue

   |->queue:
   | cmp qword [r8], 0                -- sentinel?
   | je >9

   -- Load parameters for the next copy
   | mov rsi, [r8]                    -- source
   | mov rdi, [r8+8]                  -- destination
   | mov rcx, [r8+16]                 -- length

   -- Copy 32 bytes at a time. The length is only tested after each
   -- 32-byte move, so every queued length must be at least 32.
   | xor rax, rax
   |->copy:
   | vmovdqu ymm0, [rsi+rax]
   | vmovdqu [rdi+rax], ymm0
   | add rax, 32
   | cmp rax, rcx
   -- Unsigned comparison: the length field is a uint64_t.
   | jb ->copy

   -- Advance to the next copy in the queue
   | add r8, ffi.sizeof('struct blit_queue_entry')
   | jmp ->queue
   |9:
   -- NOTE(review): ymm0 is left dirty on return; consider vzeroupper
   -- here if SSE/AVX transition penalties show up in measurements.
   | ret
end
|
||
-- XXX the code below is copy-paste and should be reused somehow.

-- Set to true to print a dump of each routine as it is assembled.
local debug = false
-- Every block of generated machine code is appended here; presumably
-- this keeps the code referenced so it is not garbage collected while
-- function pointers into it are still in use.
anchor = {}

-- Utility: assemble code and optionally dump disassembly.
--
-- name:      label printed in front of the dump when debug is set.
-- prototype: C function pointer type to cast the machine code to.
-- generator: function that receives the DynASM state and emits code.
--
-- Returns the machine code cast to the given prototype.
local function assemble (name, prototype, generator)
   local Dst = dasm.new(actions)
   generator(Dst)
   local mcode, size = Dst:build()
   table.insert(anchor, mcode)
   if debug then
      print("mcode dump: "..name)
      dasm.dump(mcode, size)
   end
   return ffi.cast(prototype, mcode)
end
|
||
-- Machine code for the barrier function
local asm_barrier = assemble("barrier", "void(*)()", gen_barrier)

-- Wait until all copies have completed.
--
-- Runs the assembled routine over everything queued by copy() and
-- then resets the queue to empty.
function barrier ()
   asm_barrier()
   nqueued = 0
   queue[0].src = nil -- sentinel
end
|
||
-- Test by doing the same random copies with blit and ffi.copy() and
-- comparing the results.
--
-- Two buffers start out with identical contents. Each random copy is
-- scheduled through copy() on one buffer and executed directly with
-- ffi.copy() on the other; after barrier() both must still be equal.
-- Sources are taken from offset 5120 upwards and destinations from
-- the low end of the buffer, so a source never overlaps a destination.
-- Raises an error on the first differing byte.
function selftest ()
   print("selftest: blit")
   local membytes = 10240
   local memx = ffi.new("char[?]", membytes)
   local memy = ffi.new("char[?]", membytes)
   for round = 0, 10 do
      print("loop "..round)
      -- Initialize memx and memy with identical randomly chosen values
      for i = 0, membytes-1 do
         local n = math.random(256)
         memx[i] = n
         memy[i] = n
      end
      -- Perform some random copies
      for _ = 0, math.random(1000) do
         local srcoffset = math.random(1000)
         local dstoffset = math.random(1000)
         local length = math.random(8) * 32 + 32
         copy (memx+dstoffset, memx+srcoffset+5120, length)
         ffi.copy(memy+dstoffset, memy+srcoffset+5120, length)
      end
      -- Execute deferred copies
      barrier()
      -- Check for same contents
      for i = 0, membytes-1 do
         if memx[i] ~= memy[i] then
            local lib = require("core.lib")
            -- Dump up to 32 bytes, but never read past the buffer end.
            local dumplen = math.min(32, membytes - i)
            print(lib.hexdump(ffi.string(memx+i, dumplen)))
            print(lib.hexdump(ffi.string(memy+i, dumplen)))
            error("mismatch at byte " .. i)
         end
      end
   end
   print("selftest: ok")
end
|
||
-- Eager fallback: copy len bytes from src to dst immediately.
--
-- This is a second definition of copy() in this file. Being the later
-- assignment to the same name, it replaces the queueing version above,
-- so callers (including selftest) get this one.
function copy (dst, src, len)
   ffi.copy(dst, src, len)
end
-- No-op barrier to pair with the eager copy().
--
-- This is a second definition of barrier() in this file. Being the
-- later assignment to the same name, it replaces the version above
-- that runs the assembled routine.
function barrier ()
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,33 @@ | ||
-- blit.lua - offload engine for memory operations | ||
|
||
module(..., package.seeall) | ||
|
||
local ffi = require("ffi") | ||
|
||
-- The blit module provides "blitter" operation to offload | ||
-- performance-critical memory operations. The API allows scheduling a | ||
-- series of operations, that can be performed at any time and in any | ||
-- order, and then executing a "barrier" to wait for completion. | ||
|
||
-- The implementation in this file is very basic but could be extended | ||
-- in the future to take advantage of the flexibility afforded by the | ||
-- API to perform special optimizations (for example parallel memory | ||
-- copies to amortize cache latency, etc). | ||
|
||
-- Copy len bytes from src to dst.
--
-- The API permits deferring the work until barrier(), but this
-- implementation performs it before returning.
--
-- dst: pointer to the destination memory.
-- src: pointer to the source memory.
-- len: number of bytes to copy.
function copy (dst, src, len)
   -- Trivial implementation: simply do an immediate memory copy.
   ffi.copy(dst, src, len)
end
|
||
-- Wait until all copies have completed.
--
-- Takes no arguments and returns nothing.
function barrier ()
   -- No-op because the copies were already executed eagerly.
end
|
||
-- Module self-test. Currently a placeholder: it prints a start and an
-- "ok" line and checks nothing.
function selftest ()
   print("selftest: blit")
   -- It would be valuable to have an extensive selftest function to
   -- make it easy to develop and test new optimized blitter
   -- implementations.
   print("selftest: ok")
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might want to experiment with unrolling this manually. I got some significant speedups by having more loads in flight.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is quite delicate :-). I started out with an unrolled version of the inner loop and then found that the looping version delivered the same performance. There have been other very innocent code variations that were much slower though. I want to use the PMU to explore these differences.
I would like to try unrolling the outer loop though to see if copying several packets in parallel could help.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, delicate indeed :) One thing to try is instead of doing load, store, load, store, to do load, load, store, store. That was what worked best for me. Good luck :)