Skip to content

Commit

Permalink
Merge pull request #139 from JuliaGPU/sd/bugfix_align
Browse files Browse the repository at this point in the history
bug fixes and better aligning
  • Loading branch information
SimonDanisch authored Aug 1, 2017
2 parents 64b20df + cf2dd89 commit 65bfef1
Show file tree
Hide file tree
Showing 6 changed files with 252 additions and 8 deletions.
4 changes: 4 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,11 @@ os:

julia:
- 0.5
- 0.6
- nightly
matrix:
allow_failures:
- julia: nightly

notifications:
email: false
Expand Down
1 change: 1 addition & 0 deletions src/OpenCL.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
__precompile__(true)
module OpenCL

export cl
Expand Down
63 changes: 58 additions & 5 deletions src/context.jl
Original file line number Diff line number Diff line change
@@ -1,24 +1,77 @@
# OpenCL.Context

# Julia-side bookkeeping: maps a raw OpenCL context id to the number of Julia
# references registered for it. Entries are added/incremented by
# `create_jl_reference!` and decremented/removed by `free_jl_reference!`.
const _ctx_reference_count = Dict{CL_context, Int}()


# Register one more Julia reference for `ctx_id` in `_ctx_reference_count`.
# The very first registration does not touch the OpenCL refcount (the context
# already carries a refcount of 1); every additional registration retains the
# context once more before the Julia-side counter is bumped.
function create_jl_reference!(ctx_id::CL_context)
    already_tracked = haskey(_ctx_reference_count, ctx_id)
    if already_tracked
        # increase internal refcount, if creating an additional reference
        @check api.clRetainContext(ctx_id)
    end
    previous = get(_ctx_reference_count, ctx_id, 0)
    _ctx_reference_count[ctx_id] = previous + 1
    return
end
# Drop one Julia reference for `ctx_id` from `_ctx_reference_count`.
# Errors when the id was never registered or when its counter is already 0.
# The entry is removed entirely once the last reference is dropped.
function free_jl_reference!(ctx_id::CL_context)
    haskey(_ctx_reference_count, ctx_id) || error("Freeing unknown context")
    remaining = _ctx_reference_count[ctx_id]
    remaining == 0 && error("Double free of context id: ", ctx_id)
    if remaining == 1
        # last Julia reference: forget the id completely
        delete!(_ctx_reference_count, ctx_id)
    else
        _ctx_reference_count[ctx_id] = remaining - 1
    end
    return
end

# Julia wrapper around a raw OpenCL context handle.
#
# Every `Context` object registers itself in `_ctx_reference_count` on
# construction and installs a finalizer that gives its reference back exactly
# once (the handle is set to `C_NULL` afterwards so a second finalization is a
# no-op).
type Context <: CLObject
    id :: CL_context

    # If created from ctx_id already, we need to increase the reference count
    # because then we give out multiple context references with multiple finalizers to the world
    # TODO should we make it in a way, that you can't overwrite it?
    #
    # `retain = true` additionally retains the OpenCL context up front.
    # Errors if the handle reports no live references.
    function Context(ctx_id::CL_context; retain = false)
        retain && @check api.clRetainContext(ctx_id)
        if !is_ctx_id_alive(ctx_id)
            error("ctx_id not alive: ", ctx_id)
        end
        ctx = new(ctx_id)
        create_jl_reference!(ctx_id)
        finalizer(ctx, c -> begin
            retain || _deletecached!(c);
            if c.id != C_NULL
                # release exactly once: `release_ctx_id` performs the
                # clReleaseContext call itself, so no direct release here
                release_ctx_id(c.id)
                free_jl_reference!(c.id)
                c.id = C_NULL
            end
        end )
        return ctx
    end
end

# Query the OpenCL reference count (CL_CONTEXT_REFERENCE_COUNT) of a raw
# context id via `clGetContextInfo`.
function number_of_references(ctx_id::CL_context)
    count_ref = Ref{CL_uint}()
    @check api.clGetContextInfo(
        ctx_id,
        CL_CONTEXT_REFERENCE_COUNT,
        sizeof(CL_uint),
        count_ref,
        C_NULL
    )
    return count_ref[]
end

# Convenience method: forwards to the raw-id method using the wrapped handle.
function number_of_references(ctx::Context)
    return number_of_references(ctx.id)
end

# A context id counts as alive while OpenCL reports at least one reference to it.
is_ctx_id_alive(ctx_id::CL_context) = number_of_references(ctx_id) > 0
# Release one OpenCL reference of `ctx_id`.
# Errors instead of releasing when the id no longer reports any reference.
function release_ctx_id(ctx_id::CL_context)
    is_ctx_id_alive(ctx_id) || error("Double free for context: ", ctx_id)
    @check api.clReleaseContext(ctx_id)
    return
end

# The "pointer" of a Context is its raw OpenCL handle.
function Base.pointer(ctx::Context)
    return ctx.id
end

function Base.show(io::IO, ctx::Context)
Expand Down
159 changes: 156 additions & 3 deletions src/kernel.jl
Original file line number Diff line number Diff line change
Expand Up @@ -84,14 +84,167 @@ function set_arg!(k::Kernel, idx::Integer, arg::LocalMem)
return k
end

#TODO: type safe calling of set args for kernel (with clang)

# --- OpenCL built-in type predicates -------------------------------------
# All public predicates accept either a value or a type and forward the type
# to the underscore-prefixed worker, whose generic fallback answers `false`.

# Scalars: the fixed-width integer and floating point types listed below.
_is_cl_number(x) = false
_is_cl_number{T <: Union{
    Int64, Int32, Int16, Int8,
    UInt64, UInt32, UInt16, UInt8,
    Float64, Float32, Float16
}}(::Type{T}) = true

is_cl_number{T}(x::T) = _is_cl_number(T)
is_cl_number{T}(x::Type{T}) = _is_cl_number(T)

# Vectors: homogeneous tuples of a scalar type with 2, 3, 4, 8 or 16 components.
_is_cl_vector(x) = false
function _is_cl_vector{N, T}(x::Type{NTuple{N, T}})
    return N in (2, 3, 4, 8, 16) && is_cl_number(T)
end

is_cl_vector{T}(x::T) = _is_cl_vector(T)
is_cl_vector{T}(x::Type{T}) = _is_cl_vector(T)

# "inbuild" (built-in) means: either a vector or a scalar in the above sense.
function is_cl_inbuild{T}(x::T)
    return is_cl_vector(x) || is_cl_number(x)
end


# Filler of `N` bytes, stored as an `N`-tuple of `Int8`.
# `_packed_convert!` inserts a `Pad` after an element whose
# `cl_packed_sizeof` is larger than its Julia `sizeof`.
immutable Pad{N}
val::NTuple{N, Int8}
# Zero-argument constructor: every byte of the pad is initialised to Int8(0).
(::Type{Pad{N}}){N}() = new{N}(ntuple(i-> Int8(0), Val{N}))
end
# A pad (type or instance) is empty exactly when it holds zero bytes.
Base.isempty{N}(::Type{Pad{N}}) = (N == 0)
Base.isempty{N}(::Pad{N}) = N == 0


"""
    cl_alignement(x)

Return the alignment used for `x`, which is simply its `cl_packed_sizeof`.
This follows the rule quoted below that a data item is aligned to the size of
its data type.

OpenCL 1.2 Specs:
6.1.5 Alignment of Types
A data item declared to be a data type in memory is always aligned to the size of the data type in
bytes. For example, a float4 variable will be aligned to a 16-byte boundary, a char2 variable will
be aligned to a 2-byte boundary.
For 3-component vector data types, the size of the data type is 4 * sizeof(component). This
means that a 3-component vector data type will be aligned to a 4 * sizeof(component)
boundary. The vload3 and vstore3 built-in functions can be used to read and write, respectively,
3-component vector data types from an array of packed scalar data type.
A built-in data type that is not a power of two bytes in size must be aligned to the next larger
power of two. This rule applies to built-in types only, not structs or unions.
The OpenCL compiler is responsible for aligning data items to the appropriate alignment as
required by the data type. For arguments to a `__kernel` function declared to be a pointer to a
data type, the OpenCL compiler can assume that the pointee is always appropriately aligned as
required by the data type. The behavior of an unaligned load or store is undefined, except for the
vloadn, vload_halfn, vstoren, and vstore_halfn functions defined in section 6.12.7. The vector
load functions can read a vector from an address aligned to the element type of the vector. The
vector store functions can write a vector to an address aligned to the element type of the vector.
"""
function cl_alignement(x)
    return cl_packed_sizeof(x)
end

"""
    advance_aligned(offset, alignment)

Return `offset` rounded up to the next multiple of `alignment`.

An `offset` that already is a multiple of `alignment` is returned unchanged.
An `alignment` of `0` means "no alignment requirement": the offset is returned
as it is instead of being reset to `0`, so a caller walking through a memory
layout never loses its current position.
"""
function advance_aligned(offset, alignment)
    # Offset 0 is aligned to everything; alignment 0 imposes no constraint
    # (and would divide by zero below). In both cases keep the position.
    (offset == 0 || alignment == 0) && return offset
    remainder = offset % alignment
    remainder == 0 && return offset
    # number of padding bytes needed to reach the next multiple
    npad = alignment - remainder
    return offset + npad
end


"""
    _cl_packed_sizeof(T)

Sizeof that considers OpenCL alignement. See cl_alignement

- a field-less type of size 0 reports 4 (0 sized types can't be defined),
- an OpenCL built-in scalar/vector type reports its size rounded up to a power of two,
- any other field-less type reports its plain `sizeof`,
- a composite type reports the sum of the packed sizes of its fields.
"""
function _cl_packed_sizeof{T}(::Type{T})
    nbytes = sizeof(T)
    if nbytes == 0 && nfields(T) == 0
        # 0 sized types can't be defined
        return 4
    end
    if is_cl_inbuild(T)
        # inbuild sizes are all power of two!
        return ispow2(nbytes) ? nbytes : nextpow2(nbytes)
    end
    if nfields(T) == 0
        return nbytes
    end
    # composite type: accumulate the packed size of every field, recursively
    total = 0
    for name in fieldnames(T)
        total += _cl_packed_sizeof(fieldtype(T, name))
    end
    return total
end

# Value method: forwards to the type method using the argument's type.
cl_packed_sizeof{T}(x::T) = cl_packed_sizeof(T)
# Type method: a generated function, so `_cl_packed_sizeof(T)` is evaluated
# while the method body is generated and its result is spliced in as a constant.
Base.@generated function cl_packed_sizeof{T}(x::Type{T})
:($(_cl_packed_sizeof(T)))
end
# Unwrap `Type{T}` to `T`; used by `packed_convert`, whose generator sees
# `Type{T}` when it is called with a type as argument.
get_typ{T}(::Type{Type{T}}) = T
"""
    packed_convert(x)

Converts a Julia type to conform to a `__packed__` struct in OpenCL.
If a type gets passed, it will return the converted type.
If a value gets passed, it is flattened into a tuple of its leaf fields, with
`Pad` elements inserted where `cl_packed_sizeof` exceeds `sizeof`.
If the converted layout has the same `sizeof` as the original, the argument is
returned unchanged.
This conforms to the OpenCL 1.2 specs, section 6.11.1:
```
__packed__
This attribute, attached to struct or union type definition, specifies that each
member of the structure or union is placed to minimize the memory required. When
attached to an enum definition, it indicates that the smallest integral type should be used.
Specifying this attribute for struct and union types is equivalent to specifying
the packed attribute on each of the structure or union members.
In the following example struct my_packed_struct's members are
packed closely together, but the internal layout of its s member is not packed. To
do that, struct my_unpacked_struct would need to be packed, too.
struct my_unpacked_struct
{
char c;
int i;
};
struct __attribute__ ((packed)) my_packed_struct
{
char c;
int i;
struct my_unpacked_struct s;
};
You may only specify this attribute on the definition of a enum, struct or
union, not on a typedef which does not also define the enumerated type,
structure or union.
```
"""
@generated function packed_convert{TX}(x::TX)
# Inside the generator `x` is the *type* of the argument.
# `elements` collects `expression => type` pairs, `fields` the field-load statements.
elements = []; fields = []
# called with a type: `x` is `Type{T}`, so unwrap it to get the type to convert
T = x <: Type ? get_typ(x) : x
_packed_convert!(T, elements, fields, :x)
# tuple type built from the collected element types (including pads)
TC = Tuple{last.(elements)...}
sizeof(TC) == sizeof(T) && return :(x) # no conversion happened
if x <: Type # the argument itself is a type: return the converted tuple type
:($TC)
else
# the argument is a value: build the tuple of element expressions
tupl = Expr(:tuple)
tupl.args = first.(elements)
# hoist field loads
:($(fields...); $tupl)
end
end

# Recursive worker of `packed_convert`.
# Walks type `x` depth-first and appends to
#   `elements`: one `expression => type` pair per leaf, plus a `Pad` pair
#               wherever the packed OpenCL size exceeds the Julia size,
#   `fields`:   one `tmp = getfield(parent, :name)` statement per visited field.
# `fieldname` is the symbol holding the value currently being visited.
# Returns the tuple `(elements, fields, fieldname)`.
function _packed_convert!(x, elements = [], fields = [], fieldname = gensym(:field))
    if is_cl_inbuild(x) || nfields(x) <= 0
        # leaf: emit the element itself ...
        push!(elements, fieldname => x)
        # ... followed by a pad if the sizes don't match
        npad = cl_packed_sizeof(x) - sizeof(x)
        if npad > 0
            push!(elements, :(Pad{$npad}()) => Pad{npad})
        end
    else
        # composite: load every field into a fresh temporary and descend into it
        for name in fieldnames(x)
            tmp = gensym(string(name))
            push!(fields, :($tmp = getfield($fieldname, $(QuoteNode(name)))))
            _packed_convert!(fieldtype(x, name), elements, fields, tmp)
        end
    end
    return elements, fields, fieldname
end

"""
    set_arg!(k::Kernel, idx::Integer, arg)

Set argument number `idx` (1-based) of kernel `k` to the value `arg`.

`arg` must be of an `isbits` type, otherwise an error is raised. The value is
first converted with `packed_convert` and then handed to `clSetKernelArg`
with a size of `cl_packed_sizeof(T)`. Returns `k`.
"""
function set_arg!{T}(k::Kernel, idx::Integer, arg::T)
    @assert idx > 0 "Kernel idx must be bigger 0"
    if !isbits(T) # TODO add more thorough mem layout checks and the clang stuff
        error("Only isbits types allowed. Found: $T")
    end
    # Set the argument exactly once, using the packed/padded representation.
    # (Passing the unpadded value with `sizeof(T)` first was redundant and
    # could make `@check` throw before the packed call was ever reached.)
    aligned_arg = packed_convert(arg)
    T_aligned = typeof(aligned_arg)
    ref = Ref{T_aligned}(aligned_arg)
    # OpenCL argument indices are 0-based
    @check api.clSetKernelArg(k.id, cl_uint(idx - 1), cl_packed_sizeof(T), ref)
    return k
end

Expand Down
17 changes: 17 additions & 0 deletions test/runtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,18 @@ using Base.Test

using OpenCL

# Exercises `cl.packed_convert` on values and on types.
@testset "aligned convert" begin
# three 3-component Float32 tuples
x = ((10f0, 1f0, 2f0), (10f0, 1f0, 2f0), (10f0, 1f0, 2f0))
x_aligned = cl.packed_convert(x)

# every triple is expected to be followed by a 4-byte pad
@test x_aligned == ((10f0, 1f0, 2f0), cl.Pad{4}(), (10f0, 1f0, 2f0), cl.Pad{4}(), (10f0, 1f0, 2f0), cl.Pad{4}())
# converting the type must agree with the type of the converted value
x_aligned_t = cl.packed_convert(typeof(x))
@test x_aligned_t == typeof(x_aligned)

# a plain Float32 needs no padding and is returned unchanged
x = cl.packed_convert(77f0)
@test x == 77f0
end

function create_test_buffer()
ctx = cl.create_some_context()
queue = cl.CmdQueue(ctx)
Expand All @@ -24,4 +36,9 @@ include("test_memory.jl")
include("test_buffer.jl")
include("test_array.jl")

# After a garbage collection pass the Julia-side context reference table is
# expected to be empty again.
@testset "context jl reference counting" begin
gc()
@test isempty(cl._ctx_reference_count)
end

end # module
16 changes: 16 additions & 0 deletions test/test_context.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,22 @@
for device in cl.devices(platform)
ctx = cl.Context(device)
@test ctx != nothing
ctx_id = ctx.id
ctx2 = cl.Context(ctx_id)
@test cl.is_ctx_id_alive(ctx_id)
@test ctx.id != C_NULL
@test ctx2.id != C_NULL
finalize(ctx)
@test ctx.id == C_NULL
@test ctx2.id != C_NULL
@test cl.is_ctx_id_alive(ctx_id)
finalize(ctx2)
@test ctx.id == C_NULL
@test ctx2.id == C_NULL
# jeez, this segfaults... WHY? I suspect a driver bug for refcount == 0?
# NVIDIA 381.22
#@test !cl.is_ctx_id_alive(ctx_id)

end
end
end
Expand Down Expand Up @@ -78,4 +93,5 @@
@test parsed_properties[2] == cl.cl_context_properties(platform.id)
end
end

end

0 comments on commit 65bfef1

Please sign in to comment.