Merge pull request #139 from JuliaGPU/sd/bugfix_align

bug fixes and better aligning
JuliaGPU · Aug 1, 2017 · 65bfef1 · 65bfef1
2 parents 64b20df + cf2dd89
commit 65bfef1
Show file tree

Hide file tree

Showing 6 changed files with 252 additions and 8 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -7,7 +7,11 @@ os:
 
 julia:
  - 0.5
+ - 0.6
  - nightly
+matrix:
+ allow_failures:
+ - julia: nightly
 
 notifications:
  email: false

diff --git a/src/OpenCL.jl b/src/OpenCL.jl
@@ -1,3 +1,4 @@
+__precompile__(true)
 module OpenCL
 
 export cl

diff --git a/src/context.jl b/src/context.jl
@@ -1,24 +1,77 @@
 # OpenCL.Context
 
+const _ctx_reference_count = Dict{CL_context, Int}()
+
+
+"""
+    create_jl_reference!(ctx_id)
+
+Record one more Julia-side reference to `ctx_id` in `_ctx_reference_count`.
+If `ctx_id` is already tracked, the OpenCL context is retained first; the very
+first Julia reference does not retain, because the context already carries a
+refcount of 1 at that point.
+"""
+function create_jl_reference!(ctx_id::CL_context)
+    already_tracked = haskey(_ctx_reference_count, ctx_id)
+    # every additional Julia reference needs its own OpenCL-side retain
+    already_tracked && @check api.clRetainContext(ctx_id)
+    _ctx_reference_count[ctx_id] = get(_ctx_reference_count, ctx_id, 0) + 1
+    return
+end
+"""
+    free_jl_reference!(ctx_id)
+
+Drop one Julia-side reference to `ctx_id` from `_ctx_reference_count`.
+The entry is removed entirely when the last reference goes away.
+Raises an error if `ctx_id` is not tracked or if its stored count is zero.
+"""
+function free_jl_reference!(ctx_id::CL_context)
+    haskey(_ctx_reference_count, ctx_id) || error("Freeing unknown context")
+    current = _ctx_reference_count[ctx_id]
+    current == 0 && error("Double free of context id: ", ctx_id)
+    if current == 1
+        # last Julia reference: stop tracking this context id
+        delete!(_ctx_reference_count, ctx_id)
+    else
+        _ctx_reference_count[ctx_id] = current - 1
+    end
+    return
+end
+
 type Context <: CLObject
  id :: CL_context
-
- function Context(ctx_id::CL_context; retain=false)
- if retain
- @check api.clRetainContext(ctx_id)
+ # If created from ctx_id already, we need to increase the reference count
+ # because then we give out multiple context references with multiple finalizers to the world
+ # TODO should we make it in a way, that you can't overwrite it?
+ function Context(ctx_id::CL_context; retain = false)
+ retain && @check api.clRetainContext(ctx_id)
+ if !is_ctx_id_alive(ctx_id)
+ error("ctx_id not alive: ", ctx_id)
  end
  ctx = new(ctx_id)
+ create_jl_reference!(ctx_id)
  finalizer(ctx, c -> begin
  retain || _deletecached!(c);
  if c.id != C_NULL
- @check api.clReleaseContext(c.id)
+ release_ctx_id(c.id)
+ free_jl_reference!(c.id)
  c.id = C_NULL
  end
  end )
  return ctx
  end
 end
 
+"""
+    number_of_references(ctx_id)
+    number_of_references(ctx)
+
+Return the value OpenCL reports for `CL_CONTEXT_REFERENCE_COUNT` on the given
+context id (or on `ctx.id` when a `Context` is passed), as a `CL_uint`.
+"""
+function number_of_references(ctx_id::CL_context)
+    nrefs = Ref{CL_uint}()
+    @check api.clGetContextInfo(ctx_id, CL_CONTEXT_REFERENCE_COUNT, sizeof(CL_uint), nrefs, C_NULL)
+    return nrefs[]
+end
+number_of_references(ctx::Context) = number_of_references(ctx.id)
+
+# A context id counts as alive while OpenCL reports at least one reference to it.
+is_ctx_id_alive(ctx_id::CL_context) = number_of_references(ctx_id) > 0
+"""
+    release_ctx_id(ctx_id)
+
+Call `clReleaseContext` on `ctx_id`, but only after checking that the context
+is still alive; otherwise raise a "Double free" error instead of releasing.
+"""
+function release_ctx_id(ctx_id::CL_context)
+    is_ctx_id_alive(ctx_id) || error("Double free for context: ", ctx_id)
+    @check api.clReleaseContext(ctx_id)
+    return
+end
+
 Base.pointer(ctx::Context) = ctx.id
 
 function Base.show(io::IO, ctx::Context)

diff --git a/src/kernel.jl b/src/kernel.jl
@@ -84,14 +84,167 @@ function set_arg!(k::Kernel, idx::Integer, arg::LocalMem)
  return k
 end
 
-#TODO: type safe calling of set args for kernel (with clang)
+
+# --- scalar numbers -------------------------------------------------------
+# Fallback: anything that is not one of the listed machine number types.
+_is_cl_number(x) = false
+_is_cl_number{T <: Union{
+        Float16, Float32, Float64,
+        Int8, Int16, Int32, Int64,
+        UInt8, UInt16, UInt32, UInt64
+    }}(::Type{T}) = true
+# Accept either a value or a type; both resolve to the type-level check.
+is_cl_number{T}(x::T) = _is_cl_number(T)
+is_cl_number{T}(x::Type{T}) = _is_cl_number(T)
+
+# --- vectors --------------------------------------------------------------
+# Fallback: not a homogeneous tuple.
+_is_cl_vector(x) = false
+# A homogeneous tuple of numbers is a vector when its length is 2, 3, 4, 8 or 16.
+_is_cl_vector{N, T}(x::Type{NTuple{N, T}}) = is_cl_number(T) && N in (2, 3, 4, 8, 16)
+is_cl_vector{T}(x::T) = _is_cl_vector(T)
+is_cl_vector{T}(x::Type{T}) = _is_cl_vector(T)
+
+# --- either of the above ----------------------------------------------------
+is_cl_inbuild{T}(x::T) = is_cl_vector(x) || is_cl_number(x)
+
+
+"""
+`Pad{N}` wraps a tuple of `N` `Int8` values. The zero-argument constructor
+fills the tuple with zeros. `isempty` is `true` exactly when `N == 0`, for both
+the type `Pad{N}` and an instance of it.
+"""
+immutable Pad{N}
+    val::NTuple{N, Int8}
+    (::Type{Pad{N}}){N}() = new{N}(ntuple(i -> zero(Int8), Val{N}))
+end
+Base.isempty{N}(::Pad{N}) = (N == 0)
+Base.isempty{N}(::Type{Pad{N}}) = (N == 0)
+
+
+"""
+OpenCL 1.2 Specs:
+6.1.5 Alignment of Types
+A data item declared to be a data type in memory is always aligned to the size of the data type in
+bytes. For example, a float4 variable will be aligned to a 16-byte boundary, a char2 variable will
+be aligned to a 2-byte boundary.
+For 3-component vector data types, the size of the data type is 4 * sizeof(component). This
+means that a 3-component vector data type will be aligned to a 4 * sizeof(component)
+boundary. The vload3 and vstore3 built-in functions can be used to read and write, respectively,
+3-component vector data types from an array of packed scalar data type.
+A built-in data type that is not a power of two bytes in size must be aligned to the next larger
+power of two. This rule applies to built-in types only, not structs or unions.
+The OpenCL compiler is responsible for aligning data items to the appropriate alignment as
+required by the data type. For arguments to a `__kernel` function declared to be a pointer to a
+data type, the OpenCL compiler can assume that the pointee is always appropriately aligned as
+required by the data type. The behavior of an unaligned load or store is undefined, except for the
+vloadn, vload_halfn, vstoren, and vstore_halfn functions defined in section 6.12.7. The vector
+load functions can read a vector from an address aligned to the element type of the vector. The
+vector store functions can write a vector to an address aligned to the element type of the vector.
+"""
+cl_alignement(x) = cl_packed_sizeof(x)
+
+"""
+    advance_aligned(offset, alignment)
+
+Round `offset` up to the next multiple of `alignment` and return it.
+An `offset` that is already a multiple of `alignment` is returned unchanged.
+An `alignment` of `0` means there is nothing to align to, so `offset` is
+returned as is (it is no longer reset to `0`).
+"""
+function advance_aligned(offset, alignment)
+    # nothing to do at the origin or when no alignment is requested;
+    # returning `offset` (not a literal 0) keeps the running offset intact
+    (offset == 0 || alignment == 0) && return offset
+    misalignment = offset % alignment
+    misalignment == 0 && return offset
+    # pad up to the next multiple of `alignment`
+    return offset + (alignment - misalignment)
+end
+
+
+"""
+Size in bytes of `T` once OpenCL alignment is taken into account.
+See `cl_alignement`.
+
+- a zero-sized type without fields is counted as 4 bytes
+- a built-in number or vector is rounded up to the next power of two
+- any other field-less type keeps its `sizeof`
+- a type with fields is the sum of the packed sizes of its fields
+"""
+function _cl_packed_sizeof{T}(::Type{T})
+    nbytes = sizeof(T)
+    # 0 sized types can't be defined, so they are given a size of 4
+    nbytes == 0 && nfields(T) == 0 && return 4
+    if is_cl_inbuild(T)
+        # built-in sizes are always a power of two (e.g. a 3-vector takes 4 components)
+        return ispow2(nbytes) ? nbytes : nextpow2(nbytes)
+    end
+    nfields(T) == 0 && return nbytes
+    # composite type: accumulate the packed size field by field
+    total = 0
+    for idx in 1:nfields(T)
+        total += _cl_packed_sizeof(fieldtype(T, idx))
+    end
+    return total
+end
+
+# Value form: forwards to the type-based method below.
+cl_packed_sizeof{T}(x::T) = cl_packed_sizeof(T)
+# Type form: `_cl_packed_sizeof(T)` is evaluated while the method body is
+# generated, so the generated body consists of the resulting number only.
+Base.@generated function cl_packed_sizeof{T}(x::Type{T})
+    nbytes = _cl_packed_sizeof(T)
+    return :($nbytes)
+end
+# Unwraps `Type{T}` to `T`.
+get_typ{T}(::Type{Type{T}}) = T
+"""
+Converts a Julia type to conform to a `__packed__` struct in OpenCL.
+If a type gets passed, it will return the converted type.
+This conforms to the OpenCL 1.2 specs, section 6.11.1:
+```
+ __packed__
+ This attribute, attached to struct or union type definition, specifies that each
+ member of the structure or union is placed to minimize the memory required. When
+ attached to an enum definition, it indicates that the smallest integral type should be used.
+ Specifying this attribute for struct and union types is equivalent to specifying
+ the packed attribute on each of the structure or union members.
+ In the following example struct my_packed_struct's members are
+ packed closely together, but the internal layout of its s member is not packed. To
+ do that, struct my_unpacked_struct would need to be packed, too.
+ struct my_unpacked_struct
+ {
+ char c;
+ int i;
+ };
+
+ struct __attribute__ ((packed)) my_packed_struct
+ {
+ char c;
+ int i;
+ struct my_unpacked_struct s;
+ };
+
+ You may only specify this attribute on the definition of a enum, struct or
+ union, not on a typedef which does not also define the enumerated type,
+ structure or union.
+```
+"""
+@generated function packed_convert{TX}(x::TX)
+    # inside the generator `x` is bound to the argument's type; when the caller
+    # passed a type itself, that is `Type{T}` and we unwrap it
+    got_type = x <: Type
+    T = got_type ? get_typ(x) : x
+    elements = []; fields = []
+    _packed_convert!(T, elements, fields, :x)
+    TC = Tuple{map(last, elements)...}
+    if sizeof(TC) == sizeof(T)
+        # no padding was inserted, so the input can be handed back untouched
+        return :(x)
+    elseif got_type
+        # a type was passed in: answer with the converted (padded) tuple type
+        return :($TC)
+    end
+    # a value was passed in: build the flattened, padded tuple
+    tupl = Expr(:tuple, map(first, elements)...)
+    # field loads are hoisted in front of the tuple construction
+    return :($(fields...); $tupl)
+end
+
+# Recursively flattens type `x` into `elements`, a list of `expression => type`
+# pairs, inserting a `Pad` entry after every leaf whose packed OpenCL size
+# exceeds its Julia `sizeof`. `fields` collects the `getfield` assignments that
+# load each nested field into a fresh symbol; `fieldname` is the symbol holding
+# the value currently being flattened.
+# Returns `(elements, fields, fieldname)`.
+function _packed_convert!(x, elements = [], fields = [], fieldname = gensym(:field))
+    if is_cl_inbuild(x) || nfields(x) <= 0
+        # leaf: emit the value itself, followed by padding when it is needed
+        push!(elements, fieldname => x)
+        npad = cl_packed_sizeof(x) - sizeof(x)
+        if npad > 0
+            push!(elements, :(Pad{$npad}()) => Pad{npad})
+        end
+    else
+        # composite: load every field into its own symbol and recurse into it
+        for field in fieldnames(x)
+            current_field = gensym(string(field))
+            load = :($current_field = getfield($fieldname, $(QuoteNode(field))))
+            push!(fields, load)
+            _packed_convert!(fieldtype(x, field), elements, fields, current_field)
+        end
+    end
+    return elements, fields, fieldname
+end
+
 function set_arg!{T}(k::Kernel, idx::Integer, arg::T)
  @assert idx > 0 "Kernel idx must be bigger 0"
  if !isbits(T) # TODO add more thorough mem layout checks and the clang stuff
  error("Only isbits types allowed. Found: $T")
  end
- boxed_arg = Ref{T}(arg)
- @check api.clSetKernelArg(k.id, cl_uint(idx - 1), sizeof(T), boxed_arg)
+ aligned_arg = packed_convert(arg)
+ T_aligned = typeof(aligned_arg)
+ ref = Ref{T_aligned}(aligned_arg)
+ @check api.clSetKernelArg(k.id, cl_uint(idx - 1), cl_packed_sizeof(T), ref)
  return k
 end
 

diff --git a/test/runtests.jl b/test/runtests.jl
@@ -3,6 +3,18 @@ using Base.Test
 
 using OpenCL
 
+@testset "aligned convert" begin
+    # a 3-component Float32 vector is 12 bytes and needs 4 bytes of padding
+    vec3 = (10f0, 1f0, 2f0)
+    pad = cl.Pad{4}()
+    unpacked = (vec3, vec3, vec3)
+
+    packed = cl.packed_convert(unpacked)
+    @test packed == (vec3, pad, vec3, pad, vec3, pad)
+
+    # converting the type must agree with converting a value of that type
+    packed_type = cl.packed_convert(typeof(unpacked))
+    @test packed_type == typeof(packed)
+
+    # a plain scalar needs no padding and comes back unchanged
+    scalar = cl.packed_convert(77f0)
+    @test scalar == 77f0
+end
+
 function create_test_buffer()
  ctx = cl.create_some_context()
  queue = cl.CmdQueue(ctx)
@@ -24,4 +36,9 @@ include("test_memory.jl")
 include("test_buffer.jl")
 include("test_array.jl")
 
+@testset "context jl reference counting" begin
+ gc()
+ @test isempty(cl._ctx_reference_count)
+end
+
 end # module
diff --git a/test/test_context.jl b/test/test_context.jl
@@ -5,7 +5,22 @@
  for device in cl.devices(platform)
  ctx = cl.Context(device)
  @test ctx != nothing
+ ctx_id = ctx.id
+ ctx2 = cl.Context(ctx_id)
+ @test cl.is_ctx_id_alive(ctx_id)
+ @test ctx.id != C_NULL
+ @test ctx2.id != C_NULL
  finalize(ctx)
+ @test ctx.id == C_NULL
+ @test ctx2.id != C_NULL
+ @test cl.is_ctx_id_alive(ctx_id)
+ finalize(ctx2)
+ @test ctx.id == C_NULL
+ @test ctx2.id == C_NULL
+ # jeez, this segfaults... WHY? I suspect a driver bug for refcount == 0?
+ # NVIDIA 381.22
+ #@test !cl.is_ctx_id_alive(ctx_id)
+
  end
  end
  end
@@ -78,4 +93,5 @@
  @test parsed_properties[2] == cl.cl_context_properties(platform.id)
  end
  end
+
 end