Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

EXPERIMENTAL: Implement a GC #419

Open
wants to merge 150 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
150 commits
Select commit Hold shift + click to select a range
5739881
Implement a lowering for the intrinsics generated by 'LateLowerGCFrame'
jonathanvdc Feb 21, 2019
61b9f94
Also lower 'julia.queue_gc_root'
jonathanvdc Feb 22, 2019
a921f3a
Fix correctness bugs in the new GC lowering pass
jonathanvdc Feb 22, 2019
80af54b
Use the new GC intrinsic lowering
jonathanvdc Feb 27, 2019
f177a27
Add a simple unified memory example
jonathanvdc Feb 28, 2019
5fd8a0a
Add a host-to-device communication example
jonathanvdc Feb 28, 2019
1c250c7
Fix an outdated comment
jonathanvdc Feb 28, 2019
f8e6c4b
Add a kwarg to '@cuda' that serves as a hook for kernel setup
jonathanvdc Mar 1, 2019
5426bec
Add an example that initializes a kernel global
jonathanvdc Mar 1, 2019
537bfca
Include an atomic cmpxchg example
jonathanvdc Mar 1, 2019
614d04b
Create a fully-featured interrupt example
jonathanvdc Mar 4, 2019
6ed1acf
Update interrupt example to include memory transfer during interrupts
jonathanvdc Mar 4, 2019
d960a91
Define a high-level interrupt interface
jonathanvdc Mar 4, 2019
7c627c0
Refactor interrupt examples
jonathanvdc Mar 4, 2019
c45f33d
Document interrupt API
jonathanvdc Mar 4, 2019
7c6906b
Define interrupt tests
jonathanvdc Mar 4, 2019
9c73a28
Add another interrupt test
jonathanvdc Mar 4, 2019
47439eb
Remove experimental examples
jonathanvdc Mar 4, 2019
297bedc
Implement a reader-writer lock
jonathanvdc Mar 5, 2019
cfb6dd8
Create an allocator prototype for the GC
jonathanvdc Mar 5, 2019
279f6ff
Rename 'GCFreeListEntry' to 'GCAllocationRecord'
jonathanvdc Mar 6, 2019
c0c06e2
Avoid partially overwriting allocation records
jonathanvdc Mar 6, 2019
79dc0d4
Refactor GC collection triggering logic
jonathanvdc Mar 6, 2019
563c3c0
Have the GC maintain a list of allocated blocks
jonathanvdc Mar 6, 2019
4ffc62f
Introduce the notion of a GC master record
jonathanvdc Mar 6, 2019
454a6ef
Reserve GC memory for GC frames
jonathanvdc Mar 6, 2019
24c184f
Have the GC allocate memory for root buffer sizes
jonathanvdc Mar 6, 2019
33e54b7
Use 32-bit integers to describe GC root buffer sizes
jonathanvdc Mar 6, 2019
71aa78f
Define GC frame management functions
jonathanvdc Mar 6, 2019
da046af
Make globals created by 'get_global_pointer' 'linkonce_odr'
jonathanvdc Mar 6, 2019
159acd3
Protect newly allocated objects from collection
jonathanvdc Mar 6, 2019
2b77228
Introduce a separate GPU GC lowering pass
jonathanvdc Mar 6, 2019
9a3da04
Use 'gc_malloc' instead of regular 'malloc' when in GC mode
jonathanvdc Mar 6, 2019
5bd8da4
Use pointers instead of integers to keep track of GC frames
jonathanvdc Mar 6, 2019
f560be6
Lower GC frame management intrinsics to GPU GC calls
jonathanvdc Mar 6, 2019
53db509
Allow GC frame management functions to execute concurrently with the GC
jonathanvdc Mar 7, 2019
358ceae
Move GC frame management functions into 'gc.jl'
jonathanvdc Mar 7, 2019
ecc601d
Mark GC frame management functions as '@inline'
jonathanvdc Mar 7, 2019
dcec58d
Update 'get_thread_id' to take blocks into account
jonathanvdc Mar 7, 2019
f198cf8
Introduce GC heap management data structures
jonathanvdc Mar 7, 2019
d039839
Implement the mark & sweep phases of the GC
jonathanvdc Mar 7, 2019
7358f9c
Implement a free list compaction and extra memory allocation scheme
jonathanvdc Mar 7, 2019
457006a
Update GC docs
jonathanvdc Mar 8, 2019
0666d09
Modify GC lock acquisition scheme slightly
jonathanvdc Mar 8, 2019
0f1ccc6
Avoid overly frequent garbage collections
jonathanvdc Mar 8, 2019
e48677e
Document free list compaction
jonathanvdc Mar 8, 2019
3ec9f48
Reserve a buffer for safepoints
jonathanvdc Mar 8, 2019
ef90bb4
Implement a safepoint function
jonathanvdc Mar 8, 2019
a76a568
Put safepoint flag values in an enum
jonathanvdc Mar 8, 2019
23e128c
Implement stop-the-world part of the GC
jonathanvdc Mar 8, 2019
0875425
Automatically insert safepoints
jonathanvdc Mar 8, 2019
3ad1ee8
Update GC example
jonathanvdc Mar 8, 2019
a0fbee8
Add a binary search tree example
jonathanvdc Mar 11, 2019
3e2a8ff
Use local arenas to reduce GC lock contention
jonathanvdc Mar 11, 2019
48eb3f5
Automatically insert perma-safepoints
jonathanvdc Mar 11, 2019
4a634e4
Add a comprehensive GC test
jonathanvdc Mar 11, 2019
d1ce8c7
Do not serialize warps for reader locks
jonathanvdc Mar 15, 2019
5131681
Define a GPU Mutex type
jonathanvdc Mar 15, 2019
ce75ce8
Collect GC statistics
jonathanvdc Mar 17, 2019
8745e96
Add a matrix example
jonathanvdc Mar 17, 2019
cc11f57
Amend binary tree example with a no-gc mode
jonathanvdc Mar 17, 2019
2f34088
Measure GC polling times
jonathanvdc Mar 18, 2019
cec2dcc
Rename GC free list data structures
jonathanvdc Mar 18, 2019
f4cdf0b
Implement a ScatterAlloc-based allocator
jonathanvdc Mar 18, 2019
6e14a2d
Make the allocator smarter
jonathanvdc Mar 19, 2019
699fcea
Tweak GC memory hierarchy
jonathanvdc Mar 20, 2019
bdd6c0b
Create a linked list example
jonathanvdc Mar 20, 2019
1b93aba
Fix imperfect rebase
jonathanvdc Mar 21, 2019
bb03af2
Add a StaticArrays-based GC example
jonathanvdc Mar 21, 2019
46614f3
Teach allocator to transfer memory block ownership
jonathanvdc Apr 3, 2019
d60114b
Update examples
jonathanvdc Apr 3, 2019
82208c8
Introduce benchmarking utilities
jonathanvdc Apr 3, 2019
b18a0bb
Fix some typos
jonathanvdc Apr 3, 2019
11042c4
Update '@cuda_interruptible'
jonathanvdc Apr 3, 2019
b183aaf
Don't try to include deleted intrinsics test file
jonathanvdc Apr 3, 2019
0402ad8
Switch back to free lists for local arenas
jonathanvdc Apr 3, 2019
b9029da
Fix GC collection bug
jonathanvdc Apr 3, 2019
7e20c66
Reduce initial GC heap size
jonathanvdc Apr 3, 2019
51ca870
Update benchmarking utilities
jonathanvdc Apr 3, 2019
4390d90
Put GC benchmarks in a separate directory
jonathanvdc Apr 3, 2019
a3fb290
Rename linked list benchmark import
jonathanvdc Apr 4, 2019
79ea2a1
Rename matrix GC benchmark
jonathanvdc Apr 4, 2019
f2dbf3f
Set the malloc heap size when running benchmarks
jonathanvdc Apr 4, 2019
b57b902
Add an array benchmark
jonathanvdc Apr 4, 2019
89d8bbc
Reuse 'device_reset!' in benchmarking utils
jonathanvdc Apr 4, 2019
fac07ef
Create a GC benchmark driver
jonathanvdc Apr 9, 2019
f8d4ede
Include an SSA IR optimization benchmark
jonathanvdc Apr 9, 2019
67ac9de
Tweak ssa-opt benchmark comment
jonathanvdc Apr 9, 2019
727a928
Write benchmark results to a CSV
jonathanvdc Apr 9, 2019
fabdea9
Add two additional GC benchmarks
jonathanvdc Apr 10, 2019
8dba84d
Support creating one-dimensional arrays
jonathanvdc Apr 15, 2019
6c4450f
Rename "arrays" benchmark to "static-arrays"
jonathanvdc Apr 15, 2019
d88313e
Support arrays in regular @cuda code
jonathanvdc Apr 15, 2019
ec5290e
Define a new 'arrays' benchmark
jonathanvdc Apr 15, 2019
47a52b4
Rename "gpu-array" example to "stdlib-array"
jonathanvdc Apr 15, 2019
92847f0
Introduce unreachable objects in array benchmark
jonathanvdc Apr 15, 2019
844c557
Define an array reduction benchmark
jonathanvdc Apr 15, 2019
8a11408
Include array reduction benchmark in "run-all.jl"
jonathanvdc Apr 15, 2019
1874174
Add a bitvector benchmark
jonathanvdc Apr 19, 2019
de14a7f
Add a 'malloc' keyword argument to the @cuda macro
jonathanvdc Apr 19, 2019
2908cdd
Add a pass that rewrites calls to 'malloc'
jonathanvdc Apr 19, 2019
f6107eb
Recompile runtime library for different allocators
jonathanvdc Apr 19, 2019
84ffff5
Use 'gc_malloc' as allocator when @cuda_gc is specified
jonathanvdc Apr 19, 2019
e14b9b3
Implement array expansion method
jonathanvdc Apr 19, 2019
68f4747
Create an array expansion benchmark
jonathanvdc Apr 19, 2019
a6b49dc
Introduce a special 'managed_malloc' runtime function
jonathanvdc Apr 19, 2019
133101f
Implement 'managed_malloc' differently
jonathanvdc Apr 21, 2019
73018f5
Consider custom malloc during IR checking
jonathanvdc Apr 25, 2019
d505dad
Switch to acquire-release semantics for atomics
jonathanvdc May 5, 2019
1aad738
Expose GC configuration options
jonathanvdc May 6, 2019
7575204
Make genetic algo, ssa opt benchmarks quicker
jonathanvdc May 10, 2019
fc72737
Try two GC configs when running benchmarks
jonathanvdc May 10, 2019
573d580
Fold '@cuda_gc' into '@cuda'
jonathanvdc May 10, 2019
9ac081d
Reuse pinned memory support from CUDAdrv
jonathanvdc May 10, 2019
65d3933
Merge remote-tracking branch 'upstream/master' into gc-staging
jonathanvdc May 10, 2019
6362515
Merge remote-tracking branch 'upstream/master' into gc-staging
jonathanvdc May 10, 2019
e9b20c9
Merge branch 'gc-staging' of https://github.com/jonathanvdc/CUDAnative.jl
jonathanvdc May 10, 2019
61818e8
Handle multi-dimensional 'thread' args gracefully
jonathanvdc May 10, 2019
35a3652
Define 'upload!', 'download' benchmark utils
jonathanvdc May 23, 2019
09edf89
Implement a bump allocator for kernels
jonathanvdc May 23, 2019
60d6fc6
Add a bump allocator to the GC benchmark configs
jonathanvdc May 23, 2019
1805d7f
Use 'managed_malloc' to implement 'gc_pool_alloc'
jonathanvdc May 23, 2019
d99ed4a
Update test runner to write bump allocator results
jonathanvdc May 23, 2019
c1356b1
Change how bump allocators are initialized
jonathanvdc May 23, 2019
2cdaf68
Implement jl_array_sizehint
jonathanvdc Jun 6, 2019
fc80975
Implement jl_array_grow_at
jonathanvdc Jun 6, 2019
f6d7b83
Implement 'jl_array_grow_beg'
jonathanvdc Jun 6, 2019
272e77e
Implement array deletion methods
jonathanvdc Jun 6, 2019
23fa152
Create an array feature-testing benchmark
jonathanvdc Jun 6, 2019
69a9dd5
Tweak a comment
jonathanvdc Jun 6, 2019
5a939f8
Implement jl_alloc_array_2d and jl_alloc_array_3d
jonathanvdc Jun 7, 2019
3382772
Better document array functions
jonathanvdc Jun 7, 2019
1942659
Implement jl_new_array
jonathanvdc Jun 10, 2019
8612466
Implement jl_ptr_to_array{,_1d}
jonathanvdc Jun 10, 2019
952a645
Compare GC strategies when running benchmarks
jonathanvdc Jun 11, 2019
be276cf
Tweak array-features benchmark
jonathanvdc Jun 11, 2019
85766d5
Update optim.jl to use stock Julia
jonathanvdc Jun 11, 2019
2e640f5
Fix misnomer in utils.jl
jonathanvdc Jun 11, 2019
bb7b440
Include mean in gc-heap-sizes.csv
jonathanvdc Jun 11, 2019
be1692c
Remove experimental allocator implementations
jonathanvdc Jun 11, 2019
2c058c7
Remove binary tree example
jonathanvdc Jun 11, 2019
c801922
Merge remote-tracking branch 'upstream/master' into gc-pr
jonathanvdc Jun 11, 2019
2f4f773
Update GC benchmark runner
jonathanvdc Jun 12, 2019
350f0ed
Tweak benchmarks
jonathanvdc Jun 14, 2019
93a2f57
Add a mean to 'strategies.csv' too
jonathanvdc Jun 14, 2019
2782244
Remove strategies.csv from root dir
jonathanvdc Jun 14, 2019
7380683
Include array reduction benchmark in GC benchmark suite
jonathanvdc Jun 17, 2019
c6390ed
Insert a root buffer overflow check
jonathanvdc Jun 22, 2019
a91baef
Update benchmarks with pinned memory bump allocator
jonathanvdc Jul 5, 2019
4b76aec
Write breakdown-computing code
jonathanvdc Jul 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions examples/gc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
using CUDAdrv, CUDAnative
using Test

# A small mutable struct used by the kernel below. Being mutable, instances
# are heap-allocated, which lets the example exercise GC allocation and
# survival across collections.
mutable struct TempStruct
data::Float32
end

# Take the address of `val` so the compiler must treat it as escaping;
# `@noinline` prevents the call from being optimized away, which keeps
# `val` on the (GC) heap rather than in registers or on the stack.
@noinline escape(val) = Base.pointer_from_objref(val)

# Synchronously copy the host array `source` into the device buffer
# `destination` (byte count taken from `source`).
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` out of the device buffer `source`
# into a freshly allocated host array, and return that array.
function download(::Type{T}, source, dims) where T
buffer = Array{T}(undef, dims)
Mem.copy!(pointer(buffer), source, sizeof(buffer))
return buffer
end

# Define a kernel that copies values using a temporary struct.
# Each thread reads one Float32 from `a`, stores it in a heap-allocated
# TempStruct, forces a garbage allocation large enough to trigger
# collections, and then writes the struct's value to `b`.
function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32})
# 1-based global thread index across all blocks.
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

for j in 1:2
# Allocate a mutable struct and make sure it ends up on the GC heap.
temp = TempStruct(unsafe_load(a, i))
escape(temp)

# Allocate a large garbage buffer to force collections.
gc_malloc(Csize_t(256 * 1024))

# Use the mutable struct. If its memory has been reclaimed (by accident)
# then we expect the test at the end of this file to fail.
unsafe_store!(b, temp.data, i)
end

return
end

# Number of GPU threads to launch (one element copied per thread).
thread_count = 256

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array)

# Fill the source and destination arrays.
upload!(source_array, fill(42.f0, thread_count))
upload!(destination_array, zeros(Float32, thread_count))

# Run the kernel with the GC enabled (the `gc=true` flag is the feature
# under test in this PR).
@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer)

# Every destination element must equal the source value: if the GC had
# wrongly reclaimed a live TempStruct, some element would differ.
@test download(Float32, destination_array, thread_count) == fill(42.f0, thread_count)
54 changes: 54 additions & 0 deletions examples/interrupt-memory.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using CUDAdrv, CUDAnative
using Test

# Copy the host array `source` into the device buffer `destination`.
#
# An optional `stream` (with `async = true`) requests an asynchronous copy
# on that stream. The two-argument form keeps the original synchronous
# behavior, so existing callers are unaffected.
#
# NOTE(review): the interrupt handler later in this file calls
# `upload!(source_array, ..., data_stream; async = true)`; the original
# two-argument method could not accept that call (MethodError), so the
# extra arguments are forwarded to `Mem.copy!` here.
function upload!(destination, source, stream = nothing; async = false)
if stream === nothing
Mem.copy!(destination, pointer(source), sizeof(source))
else
Mem.copy!(destination, pointer(source), sizeof(source);
async = async, stream = stream)
end
end

# Copy `dims` elements of type `T` from the device buffer `source` into
# a newly allocated host array and return that array.
function download(::Type{T}, source, dims) where T
host = Array{T}(undef, dims)
Mem.copy!(pointer(host), source, sizeof(host))
return host
end

# Define a kernel that copies some data from one array to another.
# The host is invoked to populate the source array.
function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32})
# 1-based global thread index across all blocks.
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
# Ask the host to run the interrupt handler (which fills `a`), or wait
# for an already-pending interrupt to finish.
interrupt_or_wait()
# System-wide fence so the host's write to `a` is visible to this thread.
threadfence_system()
Base.unsafe_store!(b, Base.unsafe_load(a, i), i)
return
end

# Number of GPU threads to launch (one element copied per thread).
thread_count = 64

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array)

# Zero-fill the source and destination arrays.
upload!(source_array, zeros(Float32, thread_count))
upload!(destination_array, zeros(Float32, thread_count))

# Define one stream for kernel execution and another for
# data transfer.
data_stream = CuStream()
exec_stream = CuStream()

# Define a magic value.
magic = 42.f0

# Configure the interrupt to fill the input array with the magic value.
# NOTE(review): this passes a stream and `async = true` to `upload!`,
# but `upload!` as defined above takes only two positional arguments —
# confirm it forwards these to `Mem.copy!` as intended.
function handle_interrupt()
upload!(source_array, fill(magic, thread_count), data_stream; async = true)
synchronize(data_stream)
end

# Run the kernel.
@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer)

# Check that the destination buffer is as expected.
@test download(Float32, destination_array, thread_count) == fill(magic, thread_count)
24 changes: 24 additions & 0 deletions examples/interrupt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using CUDAdrv, CUDAnative
using Test

# Define a kernel that makes the host count.
# Each thread raises one host interrupt, so the host-side handler should
# run exactly once per thread.
function kernel()
interrupt()
return
end

# Number of GPU threads launched; each one triggers one interrupt.
thread_count = 64

# Configure the interrupt to increment a counter.
# The handler runs on the host, so a plain global integer is safe here
# as long as handlers are not invoked concurrently.
global counter = 0
function handle_interrupt()
global counter
counter += 1
end

# Run the kernel.
@cuda_interruptible handle_interrupt threads=thread_count kernel()

# Check that the counter's final value equals the number
# of threads.
@test counter == thread_count
88 changes: 88 additions & 0 deletions examples/linked-list.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
using CUDAnative, CUDAdrv
using Test
import Base: foldl, reduce, sum

# This test constructs a linked list in a GPU kernel.

# Toggle between the GC-enabled launch and the plain launch below.
use_gc = true

# A singly linked list: either the empty list (`Nil`) or a `Cons` cell
# holding one value and the rest of the list. The nodes are mutable
# structs so they are heap-allocated and managed by the GC under test.
abstract type List{T}
end

# The empty list.
mutable struct Nil{T} <: List{T}
end

# A non-empty list node: one element plus the tail of the list.
mutable struct Cons{T} <: List{T}
value::T
next::List{T}
end

# Convenience constructor: a single-element list (value followed by Nil).
Cons{T}(value::T) where T = Cons{T}(value, Nil{T}())

# Build a List{T} from `count` elements read from `ptr`, preserving their
# order. Elements are consumed back to front so that each node can be
# prepended in O(1).
function List{T}(ptr, count::Integer) where T
list = Nil{T}()
i = count
while i >= 1
list = Cons{T}(unsafe_load(ptr, i), list)
i -= 1
end
return list
end

# Left fold over the list: combine the accumulator (seeded with `init`)
# with each element from front to back and return the final accumulator.
function foldl(op, list::List{T}; init) where T
acc = init
current = list
while current isa Cons{T}
acc = op(acc, current.value)
current = current.next
end
return acc
end

# Reduce the list with `op`, seeded with `init`; delegates to `foldl`.
reduce(op, list::List{T}; init) where T = foldl(op, list; init = init)

# Sum of all elements; seeding with `zero(T)` keeps the result type stable.
sum(list::List{T}) where T = reduce(+, list; init = zero(T))

const element_count = 2000 # list length each thread builds in the kernel
const thread_count = 32    # number of GPU threads launched

# Synchronously copy the host array `source` into the device buffer
# `destination`.
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` from the device buffer `source` into
# a new host array and return it.
function download(::Type{T}, source, dims) where T
out = Array{T}(undef, dims)
Mem.copy!(pointer(out), source, sizeof(out))
return out
end

# Each thread builds its own `element_count`-node linked list from the
# shared `elements` buffer and writes the list's sum to its result slot.
function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64})
# 1-based global thread index across all blocks.
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
l = List{Int64}(elements, element_count)
unsafe_store!(results, sum(l), i)
return
end

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * element_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array)

# Fill the source and destination arrays.
upload!(source_array, Array(1:element_count))
upload!(destination_array, zeros(Int64, thread_count))

# Run the kernel.
# NOTE(review): the two branches measure different things — the GC branch
# binds `stats` to the result of `@cuda gc=true ...` (a warm-up launch with
# a small-arena GCConfiguration precedes it), while the no-GC branch binds
# `stats` to a `CUDAdrv.@elapsed` time. Confirm this asymmetry is intended.
if use_gc
@cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer)
stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer)
else
@cuda threads=thread_count kernel(source_pointer, destination_pointer)
stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer)
end
println(stats)

# Every thread sums the same 1:element_count sequence, so all result
# slots must hold that sum.
@test download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count)
46 changes: 46 additions & 0 deletions examples/lock.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
using CUDAdrv, CUDAnative
using Test

const thread_count = Int32(128) # number of GPU threads launched
const total_count = Int32(1024) # target value the shared counter must reach

# Define a kernel that atomically increments a counter using a lock.
# Each thread repeatedly tries to take the mutex; on success it bumps the
# shared counter and releases the lock, looping until the counter reaches
# `total_count`. A thread whose `try_lock` fails exits the loop — at that
# moment some other thread holds the lock and will keep incrementing, so
# the counter still reaches `total_count`.
function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState})
# Wrap the device-memory state word in a Mutex handle.
lock = Mutex(lock_state)
done = false
while !done && try_lock(lock)
# Critical section: read-modify-write of the shared counter.
new_count = unsafe_load(counter) + 1
unsafe_store!(counter, new_count)
if new_count == total_count
done = true
end
CUDAnative.unlock(lock)
end
return
end

# Synchronously copy the host array `source` into the device buffer
# `destination`.
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` (by default a single element) from the
# device buffer `source` into a freshly allocated host array and return it.
#
# The `dims = 1` default is a fix: the check at the end of this file calls
# `download(Int32, counter_buf)` with only two arguments, which did not
# match the original three-argument method (MethodError). The default is
# backward-compatible — all three-argument callers behave as before.
function download(::Type{T}, source, dims = 1) where T
result = Array{T}(undef, dims)
Mem.copy!(pointer(result), source, sizeof(result))
result
end

# Allocate memory for the counter and the lock.
counter_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(Int32))
upload!(counter_buf, [Int32(0)])
counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf)

lock_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(CUDAnative.MutexState))
upload!(lock_buf, [CUDAnative.MutexState(0)])
lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf)

# Run the kernel.
@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer)

# Check that the counter's final value equals the number
# of threads.
# NOTE(review): the comparison is against `total_count` (1024), not the
# thread count (128), so the comment above looks stale. Also confirm that
# `download` accepts this two-argument call; its definition above takes a
# `dims` argument as well.
@test download(Int32, counter_buf) == [Int32(total_count)]
Loading