Skip to content
This repository has been archived by the owner on May 27, 2021. It is now read-only.

EXPERIMENTAL: Implement a GC #419

Open
wants to merge 150 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
150 commits
Select commit Hold shift + click to select a range
5739881
Implement a lowering for the intrinsics generated by 'LateLowerGCFrame'
jonathanvdc Feb 21, 2019
61b9f94
Also lower 'julia.queue_gc_root'
jonathanvdc Feb 22, 2019
a921f3a
Fix correctness bugs in the new GC lowering pass
jonathanvdc Feb 22, 2019
80af54b
Use the new GC intrinsic lowering
jonathanvdc Feb 27, 2019
f177a27
Add a simple unified memory example
jonathanvdc Feb 28, 2019
5fd8a0a
Add a host-to-device communication example
jonathanvdc Feb 28, 2019
1c250c7
Fix an outdated comment
jonathanvdc Feb 28, 2019
f8e6c4b
Add a kwarg to '@cuda' that serves as a hook for kernel setup
jonathanvdc Mar 1, 2019
5426bec
Add an example that initializes a kernel global
jonathanvdc Mar 1, 2019
537bfca
Include an atomic cmpxchg example
jonathanvdc Mar 1, 2019
614d04b
Create a fully-featured interrupt example
jonathanvdc Mar 4, 2019
6ed1acf
Update interrupt example to include memory transfer during interrupts
jonathanvdc Mar 4, 2019
d960a91
Define a high-level interrupt interface
jonathanvdc Mar 4, 2019
7c627c0
Refactor interrupt examples
jonathanvdc Mar 4, 2019
c45f33d
Document interrupt API
jonathanvdc Mar 4, 2019
7c6906b
Define interrupt tests
jonathanvdc Mar 4, 2019
9c73a28
Add another interrupt test
jonathanvdc Mar 4, 2019
47439eb
Remove experimental examples
jonathanvdc Mar 4, 2019
297bedc
Implement a reader-writer lock
jonathanvdc Mar 5, 2019
cfb6dd8
Create an allocator prototype for the GC
jonathanvdc Mar 5, 2019
279f6ff
Rename 'GCFreeListEntry' to 'GCAllocationRecord'
jonathanvdc Mar 6, 2019
c0c06e2
Avoid partially overwriting allocation records
jonathanvdc Mar 6, 2019
79dc0d4
Refactor GC collection triggering logic
jonathanvdc Mar 6, 2019
563c3c0
Have the GC maintain a list of allocated blocks
jonathanvdc Mar 6, 2019
4ffc62f
Introduce the notion of a GC master record
jonathanvdc Mar 6, 2019
454a6ef
Reserve GC memory for GC frames
jonathanvdc Mar 6, 2019
24c184f
Have the GC allocate memory for root buffer sizes
jonathanvdc Mar 6, 2019
33e54b7
Use 32-bit integers to describe GC root buffer sizes
jonathanvdc Mar 6, 2019
71aa78f
Define GC frame management functions
jonathanvdc Mar 6, 2019
da046af
Make globals created by 'get_global_pointer' 'linkonce_odr'
jonathanvdc Mar 6, 2019
159acd3
Protect newly allocated objects from collection
jonathanvdc Mar 6, 2019
2b77228
Introduce a separate GPU GC lowering pass
jonathanvdc Mar 6, 2019
9a3da04
Use 'gc_malloc' instead of regular 'malloc' when in GC mode
jonathanvdc Mar 6, 2019
5bd8da4
Use pointers instead of integers to keep track of GC frames
jonathanvdc Mar 6, 2019
f560be6
Lower GC frame management intrinsics to GPU GC calls
jonathanvdc Mar 6, 2019
53db509
Allow GC frame management functions to execute concurrently with the GC
jonathanvdc Mar 7, 2019
358ceae
Move GC frame management functions into 'gc.jl'
jonathanvdc Mar 7, 2019
ecc601d
Mark GC frame management functions as '@inline'
jonathanvdc Mar 7, 2019
dcec58d
Update 'get_thread_id' to take blocks into account
jonathanvdc Mar 7, 2019
f198cf8
Introduce GC heap management data structures
jonathanvdc Mar 7, 2019
d039839
Implement the mark & sweep phases of the GC
jonathanvdc Mar 7, 2019
7358f9c
Implement a free list compaction and extra memory allocation scheme
jonathanvdc Mar 7, 2019
457006a
Update GC docs
jonathanvdc Mar 8, 2019
0666d09
Modify GC lock acquisition scheme slightly
jonathanvdc Mar 8, 2019
0f1ccc6
Avoid overly frequent garbage collections
jonathanvdc Mar 8, 2019
e48677e
Document free list compaction
jonathanvdc Mar 8, 2019
3ec9f48
Reserve a buffer for safepoints
jonathanvdc Mar 8, 2019
ef90bb4
Implement a safepoint function
jonathanvdc Mar 8, 2019
a76a568
Put safepoint flag values in an enum
jonathanvdc Mar 8, 2019
23e128c
Implement stop-the-world part of the GC
jonathanvdc Mar 8, 2019
0875425
Automatically insert safepoints
jonathanvdc Mar 8, 2019
3ad1ee8
Update GC example
jonathanvdc Mar 8, 2019
a0fbee8
Add a binary search tree example
jonathanvdc Mar 11, 2019
3e2a8ff
Use local arenas to reduce GC lock contention
jonathanvdc Mar 11, 2019
48eb3f5
Automatically insert perma-safepoints
jonathanvdc Mar 11, 2019
4a634e4
Add a comprehensive GC test
jonathanvdc Mar 11, 2019
d1ce8c7
Do not serialize warps for reader locks
jonathanvdc Mar 15, 2019
5131681
Define a GPU Mutex type
jonathanvdc Mar 15, 2019
ce75ce8
Collect GC statistics
jonathanvdc Mar 17, 2019
8745e96
Add a matrix example
jonathanvdc Mar 17, 2019
cc11f57
Amend binary tree example with a no-gc mode
jonathanvdc Mar 17, 2019
2f34088
Measure GC polling times
jonathanvdc Mar 18, 2019
cec2dcc
Rename GC free list data structures
jonathanvdc Mar 18, 2019
f4cdf0b
Implement a ScatterAlloc-based allocator
jonathanvdc Mar 18, 2019
6e14a2d
Make the allocator smarter
jonathanvdc Mar 19, 2019
699fcea
Tweak GC memory hierarchy
jonathanvdc Mar 20, 2019
bdd6c0b
Create a linked list example
jonathanvdc Mar 20, 2019
1b93aba
Fix imperfect rebase
jonathanvdc Mar 21, 2019
bb03af2
Add a StaticArrays-based GC example
jonathanvdc Mar 21, 2019
46614f3
Teach allocator to transfer memory block ownership
jonathanvdc Apr 3, 2019
d60114b
Update examples
jonathanvdc Apr 3, 2019
82208c8
Introduce benchmarking utilities
jonathanvdc Apr 3, 2019
b18a0bb
Fix some typos
jonathanvdc Apr 3, 2019
11042c4
Update '@cuda_interruptible'
jonathanvdc Apr 3, 2019
b183aaf
Don't try to include deleted intrinsics test file
jonathanvdc Apr 3, 2019
0402ad8
Switch back to free lists for local arenas
jonathanvdc Apr 3, 2019
b9029da
Fix GC collection bug
jonathanvdc Apr 3, 2019
7e20c66
Reduce initial GC heap size
jonathanvdc Apr 3, 2019
51ca870
Update benchmarking utilities
jonathanvdc Apr 3, 2019
4390d90
Put GC benchmarks in a separate directory
jonathanvdc Apr 3, 2019
a3fb290
Rename linked list benchmark import
jonathanvdc Apr 4, 2019
79ea2a1
Rename matrix GC benchmark
jonathanvdc Apr 4, 2019
f2dbf3f
Set the malloc heap size when running benchmarks
jonathanvdc Apr 4, 2019
b57b902
Add an array benchmark
jonathanvdc Apr 4, 2019
89d8bbc
Reuse 'device_reset!' in benchmarking utils
jonathanvdc Apr 4, 2019
fac07ef
Create a GC benchmark driver
jonathanvdc Apr 9, 2019
f8d4ede
Include an SSA IR optimization benchmark
jonathanvdc Apr 9, 2019
67ac9de
Tweak ssa-opt benchmark comment
jonathanvdc Apr 9, 2019
727a928
Write benchmark results to a CSV
jonathanvdc Apr 9, 2019
fabdea9
Add two additional GC benchmarks
jonathanvdc Apr 10, 2019
8dba84d
Support creating one-dimensional arrays
jonathanvdc Apr 15, 2019
6c4450f
Rename "arrays" benchmark to "static-arrays"
jonathanvdc Apr 15, 2019
d88313e
Support arrays in regular @cuda code
jonathanvdc Apr 15, 2019
ec5290e
Define a new 'arrays' benchmark
jonathanvdc Apr 15, 2019
47a52b4
Rename "gpu-array" example to "stdlib-array"
jonathanvdc Apr 15, 2019
92847f0
Introduce unreachable objects in array benchmark
jonathanvdc Apr 15, 2019
844c557
Define an array reduction benchmark
jonathanvdc Apr 15, 2019
8a11408
Include array reduction benchmark in "run-all.jl"
jonathanvdc Apr 15, 2019
1874174
Add a bitvector benchmark
jonathanvdc Apr 19, 2019
de14a7f
Add a 'malloc' keyword argument to the @cuda macro
jonathanvdc Apr 19, 2019
2908cdd
Add a pass that rewrites calls to 'malloc'
jonathanvdc Apr 19, 2019
f6107eb
Recompile runtime library for different allocators
jonathanvdc Apr 19, 2019
84ffff5
Use 'gc_malloc' as allocator when @cuda_gc is specified
jonathanvdc Apr 19, 2019
e14b9b3
Implement array expansion method
jonathanvdc Apr 19, 2019
68f4747
Create an array expansion benchmark
jonathanvdc Apr 19, 2019
a6b49dc
Introduce a special 'managed_malloc' runtime function
jonathanvdc Apr 19, 2019
133101f
Implement 'managed_malloc' differently
jonathanvdc Apr 21, 2019
73018f5
Consider custom malloc during IR checking
jonathanvdc Apr 25, 2019
d505dad
Switch to acquire-release semantics for atomics
jonathanvdc May 5, 2019
1aad738
Expose GC configuration options
jonathanvdc May 6, 2019
7575204
Make genetic algo, ssa opt benchmarks quicker
jonathanvdc May 10, 2019
fc72737
Try two GC configs when running benchmarks
jonathanvdc May 10, 2019
573d580
Fold '@cuda_gc' into '@cuda'
jonathanvdc May 10, 2019
9ac081d
Reuse pinned memory support from CUDAdrv
jonathanvdc May 10, 2019
65d3933
Merge remote-tracking branch 'upstream/master' into gc-staging
jonathanvdc May 10, 2019
6362515
Merge remote-tracking branch 'upstream/master' into gc-staging
jonathanvdc May 10, 2019
e9b20c9
Merge branch 'gc-staging' of https://github.com/jonathanvdc/CUDAnative.jl
jonathanvdc May 10, 2019
61818e8
Handle multi-dimensional 'thread' args gracefully
jonathanvdc May 10, 2019
35a3652
Define 'upload!', 'download' benchmark utils
jonathanvdc May 23, 2019
09edf89
Implement a bump allocator for kernels
jonathanvdc May 23, 2019
60d6fc6
Add a bump allocator to the GC benchmark configs
jonathanvdc May 23, 2019
1805d7f
Use 'managed_malloc' to implement 'gc_pool_alloc'
jonathanvdc May 23, 2019
d99ed4a
Update test runner to write bump allocator results
jonathanvdc May 23, 2019
c1356b1
Change how bump allocators are initialized
jonathanvdc May 23, 2019
2cdaf68
Implement jl_array_sizehint
jonathanvdc Jun 6, 2019
fc80975
Implement jl_array_grow_at
jonathanvdc Jun 6, 2019
f6d7b83
Implement 'jl_array_grow_beg'
jonathanvdc Jun 6, 2019
272e77e
Implement array deletion methods
jonathanvdc Jun 6, 2019
23fa152
Create an array feature-testing benchmark
jonathanvdc Jun 6, 2019
69a9dd5
Tweak a comment
jonathanvdc Jun 6, 2019
5a939f8
Implement jl_alloc_array_2d and jl_alloc_array_3d
jonathanvdc Jun 7, 2019
3382772
Better document array functions
jonathanvdc Jun 7, 2019
1942659
Implement jl_new_array
jonathanvdc Jun 10, 2019
8612466
Implement jl_ptr_to_array{,_1d}
jonathanvdc Jun 10, 2019
952a645
Compare GC strategies when running benchmarks
jonathanvdc Jun 11, 2019
be276cf
Tweak array-features benchmark
jonathanvdc Jun 11, 2019
85766d5
Update optim.jl to use stock Julia
jonathanvdc Jun 11, 2019
2e640f5
Fix misnomer in utils.jl
jonathanvdc Jun 11, 2019
bb7b440
Include mean in gc-heap-sizes.csv
jonathanvdc Jun 11, 2019
be1692c
Remove experimental allocator implementations
jonathanvdc Jun 11, 2019
2c058c7
Remove binary tree example
jonathanvdc Jun 11, 2019
c801922
Merge remote-tracking branch 'upstream/master' into gc-pr
jonathanvdc Jun 11, 2019
2f4f773
Update GC benchmark runner
jonathanvdc Jun 12, 2019
350f0ed
Tweak benchmarks
jonathanvdc Jun 14, 2019
93a2f57
Add a mean to 'strategies.csv' too
jonathanvdc Jun 14, 2019
2782244
Remove strategies.csv from root dir
jonathanvdc Jun 14, 2019
7380683
Include array reduction benchmark in GC benchmark suite
jonathanvdc Jun 17, 2019
c6390ed
Insert a root buffer overflow check
jonathanvdc Jun 22, 2019
a91baef
Update benchmarks with pinned memory bump allocator
jonathanvdc Jul 5, 2019
4b76aec
Write breakdown-computing code
jonathanvdc Jul 6, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions examples/gc.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
using CUDAdrv, CUDAnative
using Test

# A small mutable struct used by the kernel below. Being mutable, instances
# are heap-allocated, which lets the example exercise GC allocation and
# survival across collections.
mutable struct TempStruct
data::Float32
end

# Take the address of `val` so the compiler must treat it as escaping;
# `@noinline` prevents the call from being optimized away, which keeps
# `val` on the (GC) heap rather than in registers or on the stack.
@noinline escape(val) = Base.pointer_from_objref(val)

# Synchronously copy the host array `source` into the device buffer
# `destination` (byte count taken from `source`).
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` out of the device buffer `source`
# into a freshly allocated host array, and return that array.
function download(::Type{T}, source, dims) where T
buffer = Array{T}(undef, dims)
Mem.copy!(pointer(buffer), source, sizeof(buffer))
return buffer
end

# Define a kernel that copies values using a temporary struct.
# Each thread reads one Float32 from `a`, stores it in a heap-allocated
# TempStruct, forces a garbage allocation large enough to trigger
# collections, and then writes the struct's value to `b`.
function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32})
# 1-based global thread index across all blocks.
i = (blockIdx().x - 1) * blockDim().x + threadIdx().x

for j in 1:2
# Allocate a mutable struct and make sure it ends up on the GC heap.
temp = TempStruct(unsafe_load(a, i))
escape(temp)

# Allocate a large garbage buffer to force collections.
gc_malloc(Csize_t(256 * 1024))

# Use the mutable struct. If its memory has been reclaimed (by accident)
# then we expect the test at the end of this file to fail.
unsafe_store!(b, temp.data, i)
end

return
end

# Number of GPU threads to launch (one element copied per thread).
thread_count = 256

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array)

# Fill the source and destination arrays.
upload!(source_array, fill(42.f0, thread_count))
upload!(destination_array, zeros(Float32, thread_count))

# Run the kernel with the GC enabled (the `gc=true` flag is the feature
# under test in this PR).
@cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer)

# Every destination element must equal the source value: if the GC had
# wrongly reclaimed a live TempStruct, some element would differ.
@test download(Float32, destination_array, thread_count) == fill(42.f0, thread_count)
54 changes: 54 additions & 0 deletions examples/interrupt-memory.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
using CUDAdrv, CUDAnative
using Test

# Copy the host array `source` into the device buffer `destination`.
#
# An optional `stream` (with `async = true`) requests an asynchronous copy
# on that stream. The two-argument form keeps the original synchronous
# behavior, so existing callers are unaffected.
#
# NOTE(review): the interrupt handler later in this file calls
# `upload!(source_array, ..., data_stream; async = true)`; the original
# two-argument method could not accept that call (MethodError), so the
# extra arguments are forwarded to `Mem.copy!` here.
function upload!(destination, source, stream = nothing; async = false)
if stream === nothing
Mem.copy!(destination, pointer(source), sizeof(source))
else
Mem.copy!(destination, pointer(source), sizeof(source);
async = async, stream = stream)
end
end

# Copy `dims` elements of type `T` from the device buffer `source` into
# a newly allocated host array and return that array.
function download(::Type{T}, source, dims) where T
host = Array{T}(undef, dims)
Mem.copy!(pointer(host), source, sizeof(host))
return host
end

# Define a kernel that copies some data from one array to another.
# The host is invoked to populate the source array.
function kernel(a::CUDAnative.DevicePtr{Float32}, b::CUDAnative.DevicePtr{Float32})
# 1-based global thread index across all blocks.
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
# Ask the host to run the interrupt handler (which fills `a`), or wait
# for an already-pending interrupt to finish.
interrupt_or_wait()
# System-wide fence so the host's write to `a` is visible to this thread.
threadfence_system()
Base.unsafe_store!(b, Base.unsafe_load(a, i), i)
return
end

# Number of GPU threads to launch (one element copied per thread).
thread_count = 64

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Float32) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Float32}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Float32}, destination_array)

# Zero-fill the source and destination arrays.
upload!(source_array, zeros(Float32, thread_count))
upload!(destination_array, zeros(Float32, thread_count))

# Define one stream for kernel execution and another for
# data transfer.
data_stream = CuStream()
exec_stream = CuStream()

# Define a magic value.
magic = 42.f0

# Configure the interrupt to fill the input array with the magic value.
# NOTE(review): this passes a stream and `async = true` to `upload!`,
# but `upload!` as defined above takes only two positional arguments —
# confirm it forwards these to `Mem.copy!` as intended.
function handle_interrupt()
upload!(source_array, fill(magic, thread_count), data_stream; async = true)
synchronize(data_stream)
end

# Run the kernel.
@cuda_interruptible handle_interrupt threads=thread_count stream=exec_stream kernel(source_pointer, destination_pointer)

# Check that the destination buffer is as expected.
@test download(Float32, destination_array, thread_count) == fill(magic, thread_count)
24 changes: 24 additions & 0 deletions examples/interrupt.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
using CUDAdrv, CUDAnative
using Test

# Define a kernel that makes the host count.
# Each thread raises one host interrupt, so the host-side handler should
# run exactly once per thread.
function kernel()
interrupt()
return
end

# Number of GPU threads launched; each one triggers one interrupt.
thread_count = 64

# Configure the interrupt to increment a counter.
# The handler runs on the host, so a plain global integer is safe here
# as long as handlers are not invoked concurrently.
global counter = 0
function handle_interrupt()
global counter
counter += 1
end

# Run the kernel.
@cuda_interruptible handle_interrupt threads=thread_count kernel()

# Check that the counter's final value equals the number
# of threads.
@test counter == thread_count
88 changes: 88 additions & 0 deletions examples/linked-list.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
using CUDAnative, CUDAdrv
using Test
import Base: foldl, reduce, sum

# This test constructs a linked list in a GPU kernel.

# Toggle between the GC-enabled launch and the plain launch below.
use_gc = true

# A singly linked list: either the empty list (`Nil`) or a `Cons` cell
# holding one value and the rest of the list. The nodes are mutable
# structs so they are heap-allocated and managed by the GC under test.
abstract type List{T}
end

# The empty list.
mutable struct Nil{T} <: List{T}
end

# A non-empty list node: one element plus the tail of the list.
mutable struct Cons{T} <: List{T}
value::T
next::List{T}
end

# Convenience constructor: a single-element list (value followed by Nil).
Cons{T}(value::T) where T = Cons{T}(value, Nil{T}())

# Build a List{T} from `count` elements read from `ptr`, preserving their
# order. Elements are consumed back to front so that each node can be
# prepended in O(1).
function List{T}(ptr, count::Integer) where T
list = Nil{T}()
i = count
while i >= 1
list = Cons{T}(unsafe_load(ptr, i), list)
i -= 1
end
return list
end

# Left fold over the list: combine the accumulator (seeded with `init`)
# with each element from front to back and return the final accumulator.
function foldl(op, list::List{T}; init) where T
acc = init
current = list
while current isa Cons{T}
acc = op(acc, current.value)
current = current.next
end
return acc
end

# Reduce the list with `op`, seeded with `init`; delegates to `foldl`.
reduce(op, list::List{T}; init) where T = foldl(op, list; init = init)

# Sum of all elements; seeding with `zero(T)` keeps the result type stable.
sum(list::List{T}) where T = reduce(+, list; init = zero(T))

const element_count = 2000 # list length each thread builds in the kernel
const thread_count = 32    # number of GPU threads launched

# Synchronously copy the host array `source` into the device buffer
# `destination`.
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` from the device buffer `source` into
# a new host array and return it.
function download(::Type{T}, source, dims) where T
out = Array{T}(undef, dims)
Mem.copy!(pointer(out), source, sizeof(out))
return out
end

# Each thread builds its own `element_count`-node linked list from the
# shared `elements` buffer and writes the list's sum to its result slot.
function kernel(elements::CUDAnative.DevicePtr{Int64}, results::CUDAnative.DevicePtr{Int64})
# 1-based global thread index across all blocks.
i = (blockIdx().x-1) * blockDim().x + threadIdx().x
l = List{Int64}(elements, element_count)
unsafe_store!(results, sum(l), i)
return
end

# Allocate two arrays.
source_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * element_count)
destination_array = Mem.alloc(Mem.DeviceBuffer, sizeof(Int64) * thread_count)
source_pointer = Base.unsafe_convert(CuPtr{Int64}, source_array)
destination_pointer = Base.unsafe_convert(CuPtr{Int64}, destination_array)

# Fill the source and destination arrays.
upload!(source_array, Array(1:element_count))
upload!(destination_array, zeros(Int64, thread_count))

# Run the kernel.
# NOTE(review): the two branches measure different things — the GC branch
# binds `stats` to the result of `@cuda gc=true ...` (a warm-up launch with
# a small-arena GCConfiguration precedes it), while the no-GC branch binds
# `stats` to a `CUDAdrv.@elapsed` time. Confirm this asymmetry is intended.
if use_gc
@cuda gc=true threads=thread_count gc_config=GCConfiguration(; global_arena_initial_size=1024, global_arena_starvation_threshold=1024) kernel(source_pointer, destination_pointer)
stats = @cuda gc=true threads=thread_count kernel(source_pointer, destination_pointer)
else
@cuda threads=thread_count kernel(source_pointer, destination_pointer)
stats = CUDAdrv.@elapsed @cuda threads=thread_count kernel(source_pointer, destination_pointer)
end
println(stats)

# Every thread sums the same 1:element_count sequence, so all result
# slots must hold that sum.
@test download(Int64, destination_array, thread_count) == repeat([sum(1:element_count)], thread_count)
46 changes: 46 additions & 0 deletions examples/lock.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
using CUDAdrv, CUDAnative
using Test

const thread_count = Int32(128) # number of GPU threads launched
const total_count = Int32(1024) # target value the shared counter must reach

# Define a kernel that atomically increments a counter using a lock.
# Each thread repeatedly tries to take the mutex; on success it bumps the
# shared counter and releases the lock, looping until the counter reaches
# `total_count`. A thread whose `try_lock` fails exits the loop — at that
# moment some other thread holds the lock and will keep incrementing, so
# the counter still reaches `total_count`.
function increment_counter(counter::CUDAnative.DevicePtr{Int32}, lock_state::CUDAnative.DevicePtr{CUDAnative.MutexState})
# Wrap the device-memory state word in a Mutex handle.
lock = Mutex(lock_state)
done = false
while !done && try_lock(lock)
# Critical section: read-modify-write of the shared counter.
new_count = unsafe_load(counter) + 1
unsafe_store!(counter, new_count)
if new_count == total_count
done = true
end
CUDAnative.unlock(lock)
end
return
end

# Synchronously copy the host array `source` into the device buffer
# `destination`.
upload!(destination, source) = Mem.copy!(destination, pointer(source), sizeof(source))

# Copy `dims` elements of type `T` (by default a single element) from the
# device buffer `source` into a freshly allocated host array and return it.
#
# The `dims = 1` default is a fix: the check at the end of this file calls
# `download(Int32, counter_buf)` with only two arguments, which did not
# match the original three-argument method (MethodError). The default is
# backward-compatible — all three-argument callers behave as before.
function download(::Type{T}, source, dims = 1) where T
result = Array{T}(undef, dims)
Mem.copy!(pointer(result), source, sizeof(result))
result
end

# Allocate memory for the counter and the lock.
counter_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(Int32))
upload!(counter_buf, [Int32(0)])
counter_pointer = Base.unsafe_convert(CuPtr{Int32}, counter_buf)

lock_buf = Mem.alloc(Mem.DeviceBuffer, sizeof(CUDAnative.MutexState))
upload!(lock_buf, [CUDAnative.MutexState(0)])
lock_pointer = Base.unsafe_convert(CuPtr{CUDAnative.MutexState}, lock_buf)

# Run the kernel.
@cuda threads=thread_count increment_counter(counter_pointer, lock_pointer)

# Check that the counter's final value equals the number
# of threads.
# NOTE(review): the comparison is against `total_count` (1024), not the
# thread count (128), so the comment above looks stale. Also confirm that
# `download` accepts this two-argument call; its definition above takes a
# `dims` argument as well.
@test download(Int32, counter_buf) == [Int32(total_count)]
Loading