diff --git a/CHANGES.md b/CHANGES.md index 5ff8b45..85ff392 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,3 +1,14 @@ +## [0.3.1] 2024-07-20 + +### Added + +- TODO: Previously commented out parts, that require a newer version of the CUDA API. + +### Fixed + +- A major design bug, exacerbated by the asynchronous functionaliy of v0.3 -- functions performing asynchronous calls should keep the call arguments alive; the user should only forget (or free) the arguments after the calls complete (e.g. after synchronizing a stream). + - Only `launch_kernel` needed fixing as I don't think other functions allocate passed arguments. + ## [0.3.0] 2024-07-05 ### Added diff --git a/cudajit.ml b/cudajit.ml index 221d552..174f3b8 100644 --- a/cudajit.ml +++ b/cudajit.ml @@ -308,6 +308,8 @@ let module_get_function module_ ~name = check "cu_module_get_function" @@ Cuda.cu_module_get_function func module_ name; !@func +type lifetime = Remember : 'a -> lifetime + type deviceptr = | Deviceptr of Unsigned.uint64 (** A pointer to an array on a device. (Not a pointer to a device!) *) @@ -363,7 +365,7 @@ let launch_kernel func ~grid_dim_x ?(grid_dim_y = 1) ?(grid_dim_z = 1) ~block_di ?(block_dim_z = 1) ~shared_mem_bytes stream kernel_params = let i2u = Unsigned.UInt.of_int in let open Ctypes in - let c_kernel_params = + let kernel_params = List.map (function | Tensor (Deviceptr dev) -> coerce (ptr uint64_t) (ptr void) @@ allocate uint64_t dev @@ -372,13 +374,13 @@ let launch_kernel func ~grid_dim_x ?(grid_dim_y = 1) ?(grid_dim_z = 1) ~block_di | Single u -> coerce (ptr float) (ptr void) @@ allocate float u | Double u -> coerce (ptr double) (ptr void) @@ allocate double u) kernel_params - |> CArray.of_list (ptr void) - |> CArray.start in + let c_kernel_params = kernel_params |> CArray.of_list (ptr void) in check "cu_launch_kernel" @@ Cuda.cu_launch_kernel func (i2u grid_dim_x) (i2u grid_dim_y) (i2u grid_dim_z) (i2u block_dim_x) - (i2u block_dim_y) (i2u block_dim_z) (i2u shared_mem_bytes) stream c_kernel_params - @@ coerce (ptr void) (ptr @@ ptr void) null + (i2u block_dim_y) (i2u block_dim_z) (i2u shared_mem_bytes) stream (CArray.start c_kernel_params) + @@ coerce (ptr void) (ptr @@ ptr void) null; + Remember (kernel_params, c_kernel_params) let ctx_synchronize () = check "cu_ctx_synchronize" @@ Cuda.cu_ctx_synchronize () @@ -510,6 +512,7 @@ let memset_d32_async (Deviceptr dev) v ~length stream = let module_get_global module_ ~name = let open Ctypes in + (* FIXME: here and elsewhere -- we are leaking memory from [allocate_n] and [allocate] I think? *) let device = allocate_n cu_deviceptr ~count:1 in let size_in_bytes = allocate size_t Unsigned.Size_t.zero in check "cu_module_get_global" @@ Cuda.cu_module_get_global device size_in_bytes module_ name;