Skip to content

Commit

Permalink
fix for CuArray and revised speedtest
Browse files Browse the repository at this point in the history
  • Loading branch information
RainerHeintzmann committed Jul 21, 2024
1 parent 83a3412 commit 2255897
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 11 deletions.
3 changes: 2 additions & 1 deletion src/general.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,9 @@ function calculate_separables_nokw(::Type{AT}, fct, sz::NTuple{N, Int},
# allocate one contiguous block of memory to be as cache-efficient as possible and dice it up below
res = ntuple((d) -> reorient((@view all_axes[1+sum(sz[1:d])-sz[d]:sum(sz[1:d])]), Val(d), Val(N)), Val(N)) # Vector{AT}()

# below the cast of the indices is needed to make CuArrays work
toreturn = ntuple((d) ->
in_place_assing!(res, d, fct, get_1d_ids(d, sz, offset, scale), sz[d], arg_n(d, args, RT))
in_place_assing!(res, d, fct, real_arr_type(AT, Val(1))(get_1d_ids(d, sz, offset, scale)), sz[d], arg_n(d, args, RT))
, Val(N))
return toreturn
# return res
Expand Down
20 changes: 10 additions & 10 deletions test/speedtests.jl
Original file line number Diff line number Diff line change
Expand Up @@ -20,28 +20,27 @@ function speedt_test()

res2 = similar(res);
ress = gaussian_sep(sz, sigma=sigma, offset=offset);
@btime $ress = gaussian_sep($sz, sigma=$sigma, offset=$offset); # 12.3 µs
resns = gaussian_nokw_sep(sz, offset, 1f0, 1f0, sigma);
@btime $resns = gaussian_nokw_sep($sz, $offset, 1f0, 1f0, $sigma); # 12 µs
@btime $ress = gaussian_sep($sz, sigma=$sigma, offset=$offset); # 8 µs
resns = gaussian_nokw_sep(sz, offset, 1f0, sigma);
@btime $resns = gaussian_nokw_sep($sz, $offset, 1f0, $sigma); # 8 µs
res2 .= ress;
res2 res
@btime $res2 .= $ress; # 8.35 ms
@btime $res2 = similar($res); # 0.04 ms
@btime $res2 .= $ress; # 8.9 ms
@btime $res2 = similar($res); # 8 µs
@btime $res2 .= gaussian_sep($sz, sigma=$sigma, offset=$offset); # 8.4 ms

# res3 = gaussian_col(sz, sigma=sigma, offset=offset);
res3 = gaussian_col(sz, sigma=sigma, offset=offset);
t_col = @btime $res3 = gaussian_col($sz, sigma=$sigma, offset=$offset); # 14 ms

@btime $res2 .= SeparableFunctions.gaussian_lz($sz, sigma=$sigma, offset=$offset); # 8.47 ms

resc = CuArray(res);
res3c = gaussian_col(typeof(resc), sz, sigma=sigma, offset=offset); #
@btime CUDA.@sync $res3 = gaussian_col(typeof(resc), $sz, sigma=$sigma, offset=$offset); # 0.983 ms
@btime CUDA.@sync $res3c = gaussian_col(typeof(resc), $sz, sigma=$sigma, offset=$offset); # 1.06 ms

ids = CuArray(CartesianIndices(sz))
resc = get_exp.(ids, Ref(sigma), Ref(offset));
@btime CUDA.@sync $resc = get_exp.($ids, Ref($sigma)); # 2.83 ms

@btime CUDA.@sync $resc = get_exp.($ids, Ref($sigma), Ref(offset)); # 9.5 ms

t_in_place = @belapsed get_exp.(CartesianIndices($sz), Ref($sigma), Ref(offset)); # 47.7 ms, but 243 ms with offset (7 allocations, 64 Mb)!
t_gaussian_col = @belapsed $res3 = gaussian_col($sz, sigma=$sigma, offset=$offset)
Expand All @@ -52,7 +51,7 @@ function speedt_test()
t_gaussian_sep = @belapsed res_gs = gaussian_sep($sz, sigma=$sigma, offset=$offset)


tc_get_exp = @belapsed CUDA.@sync $resc = get_exp.($ids, Ref($sigma))
tc_get_exp = @belapsed CUDA.@sync $resc = get_exp.($ids, Ref($sigma), Ref($offset))
tc_gaussian_col = @belapsed CUDA.@sync $res3 = gaussian_col(typeof(resc), $sz, sigma=$sigma, offset=$offset)

# NOT working: resc .= SeparableFunctions.gaussian_lz(typeof(resc), sz, sigma=sigma, offset=offset)
Expand Down Expand Up @@ -88,6 +87,7 @@ function speedt_test()
# Phase factor exp(i * 4π * sqrt(max(0, 1/4 - x))), evaluated with Float32 constants.
# The radicand is clamped at zero, so for x >= 0.25 the result is cis(0) = 1.
function g(x)
    radicand = max(0f0, 0.25f0 - x)  # clamp keeps sqrt away from negative input
    phase = sqrt(radicand) * 12.566371f0  # 12.566371f0 ≈ 4π
    return cis(phase)
end

myrr2 = collect(rr2_sep(sz; scale=scale))
res = g.(rr2_sep(sz; scale=scale))
@time res .= g.(rr2_sep(sz; scale=scale)); # 11.7 kB
t_no_rad = @belapsed res .= g.($myrr2); # 7 ms

Expand Down

0 comments on commit 2255897

Please sign in to comment.