Skip to content

Commit

Permalink
memcpy_peer
Browse files Browse the repository at this point in the history
  • Loading branch information
lukstafi committed Jul 5, 2024
1 parent c2c79c1 commit 5a41279
Show file tree
Hide file tree
Showing 3 changed files with 56 additions and 42 deletions.
6 changes: 5 additions & 1 deletion CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,11 @@
### Added

- Support for streams (except `cuStreamWaitEvent` and graph capture).
- Support for asynchronous copying.
- Support for asynchronous copying, including `cuMemcpyPeerAsync`.

### Changed

- Renamed `byte_size` to `size_in_bytes`.

## [0.2.0] 2024-05-18

Expand Down
10 changes: 10 additions & 0 deletions cuda_ffi/bindings.ml
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,16 @@ module Functions (F : Ctypes.FOREIGN) = struct
F.foreign "cuMemcpyDtoDAsync"
F.(cu_deviceptr @-> cu_deviceptr @-> size_t @-> cu_stream @-> returning E.cu_result)

(* Foreign binding for the driver symbol "cuMemcpyPeer". Argument order: destination pointer,
   destination context, source pointer, source context, byte count. *)
let cu_memcpy_peer =
  let open F in
  foreign "cuMemcpyPeer"
    (cu_deviceptr @-> cu_context @-> cu_deviceptr @-> cu_context @-> size_t @-> returning E.cu_result)

(* Foreign binding for the driver symbol "cuMemcpyPeerAsync". Same arguments as the "cuMemcpyPeer"
   binding (destination pointer and context, source pointer and context, byte count), followed by
   the stream. *)
let cu_memcpy_peer_async =
  let open F in
  foreign "cuMemcpyPeerAsync"
    (cu_deviceptr @-> cu_context @-> cu_deviceptr @-> cu_context @-> size_t @-> cu_stream
   @-> returning E.cu_result)

(* Foreign binding for the driver symbol "cuCtxDisablePeerAccess"; takes the peer context. *)
let cu_ctx_disable_peer_access =
  let open F in
  foreign "cuCtxDisablePeerAccess" (cu_context @-> returning E.cu_result)

let cu_ctx_enable_peer_access =
Expand Down
82 changes: 41 additions & 41 deletions cudajit.ml
Original file line number Diff line number Diff line change
Expand Up @@ -311,10 +311,10 @@ let module_get_function module_ ~name =
(** A device address, kept distinct from plain integers by wrapping an [Unsigned.uint64] in the
    single constructor [Deviceptr]. *)
type deviceptr =
| Deviceptr of Unsigned.uint64 (** A pointer to an array on a device. (Not a pointer to a device!) *)

(** Requests [size_in_bytes] bytes through [Cuda.cu_mem_alloc] and returns the resulting address
    wrapped in [Deviceptr]. The status of the call is handed to [check "cu_mem_alloc"]. The labelled
    argument was previously called [byte_size]. *)
let mem_alloc ~byte_size =
let mem_alloc ~size_in_bytes =
let open Ctypes in
(* One-element buffer passed to [cu_mem_alloc] as an out-parameter and dereferenced below. *)
let deviceptr = allocate_n cu_deviceptr ~count:1 in
check "cu_mem_alloc" @@ Cuda.cu_mem_alloc deviceptr @@ Unsigned.Size_t.of_int byte_size;
check "cu_mem_alloc" @@ Cuda.cu_mem_alloc deviceptr @@ Unsigned.Size_t.of_int size_in_bytes;
Deviceptr !@deviceptr

let memcpy_H_to_D_impl ?host_offset ?length ~dst ~src memcpy =
Expand All @@ -338,8 +338,8 @@ let memcpy_H_to_D ?host_offset ?length ~dst:(Deviceptr dst) ~src () =
memcpy_H_to_D_impl ?host_offset ?length ~dst:(Deviceptr dst) ~src memcpy_H_to_D_unsafe

(** Allocates a device buffer with [mem_alloc], sized by [Bigarray.Genarray.size_in_bytes src],
    copies the bigarray [src] into it with [memcpy_H_to_D], and returns the new device pointer. *)
let alloc_and_memcpy src =
let byte_size = Bigarray.Genarray.size_in_bytes src in
let dst = mem_alloc ~byte_size in
let size_in_bytes = Bigarray.Genarray.size_in_bytes src in
let dst = mem_alloc ~size_in_bytes in
memcpy_H_to_D ~dst ~src ();
dst

Expand Down Expand Up @@ -411,44 +411,44 @@ let memcpy_D_to_H_async_unsafe ~(dst : unit Ctypes.ptr) ~src:(Deviceptr src) ~si
(** Device-to-host copy built on [memcpy_D_to_H_impl]: forwards [host_offset], [length], [dst] and
    [src] unchanged and supplies [memcpy_D_to_H_async_unsafe] as the raw copy routine. Any remaining
    arguments are whatever [memcpy_D_to_H_impl] passes on to that routine (its definition is not
    visible here; presumably the stream, to be confirmed). *)
let memcpy_D_to_H_async ?host_offset ?length ~dst ~src =
memcpy_D_to_H_impl ?host_offset ?length ~dst ~src memcpy_D_to_H_async_unsafe

(** [get_size_in_bytes ?kind ?length ?size_in_bytes provenance] resolves the number of bytes to copy.

    Provide either both [kind] and [length], or just [size_in_bytes]:
    - with [size_in_bytes] alone, that value is returned as is;
    - with [kind] and [length], the result is the byte size of one element of [kind] times [length].

    [provenance] is the name of the calling function; it prefixes the error message.

    @raise Invalid_argument
      if [size_in_bytes] is combined with [kind] or [length] ("Too many arguments"), or if
      [size_in_bytes] is absent and [kind] and [length] are not both given ("Too few arguments"). *)
let get_size_in_bytes ?kind ?length ?size_in_bytes provenance =
  match (size_in_bytes, kind, length) with
  | Some size, None, None -> size
  | None, Some kind, Some length ->
      (* Same per-element size as [Ctypes.sizeof (Ctypes.typ_of_bigarray_kind kind)], obtained
         directly from the standard library. *)
      Bigarray.kind_size_in_bytes kind * length
  | Some _, Some _, _ | Some _, _, Some _ ->
      (* An explicit size together with any part of the [kind]/[length] pair is a conflict, not a
         shortage, so report it as such. *)
      invalid_arg
        (provenance
       ^ ": Too many arguments, provide either both [kind] and [length], or just [size_in_bytes].")
  | None, _, _ ->
      invalid_arg
        (provenance
       ^ ": Too few arguments, provide either both [kind] and [length], or just [size_in_bytes].")

(** Copies from the device pointer [src] to the device pointer [dst] through
    [Cuda.cu_memcpy_D_to_D]; the status of the call is handed to [check "cu_memcpy_D_to_D"].
    Provide either both [kind] and [length], or just [size_in_bytes]. *)
let memcpy_D_to_D ?kind ?length ?size_in_bytes ~dst:(Deviceptr dst) ~src:(Deviceptr src) () =
let byte_size =
match (size_in_bytes, kind, length) with
| Some size, None, None -> size
| None, Some kind, Some length ->
(* Byte size of one element of [kind], multiplied by the element count. *)
let c_typ = Ctypes.typ_of_bigarray_kind kind in
let elem_bytes = Ctypes.sizeof c_typ in
elem_bytes * length
| Some _, Some _, Some _ ->
invalid_arg
"memcpy_D_to_D: Too many arguments, provide either both [kind] and [length], or just [byte_size]."
| _ ->
invalid_arg
"memcpy_D_to_D: Too few arguments, provide either both [kind] and [length], or just [byte_size]."
in
check "cu_memcpy_D_to_D" @@ Cuda.cu_memcpy_D_to_D dst src @@ Unsigned.Size_t.of_int byte_size
(* The size resolution is delegated to [get_size_in_bytes], tagged with this function's name. *)
let size_in_bytes = get_size_in_bytes ?kind ?length ?size_in_bytes "memcpy_D_to_D" in
check "cu_memcpy_D_to_D" @@ Cuda.cu_memcpy_D_to_D dst src @@ Unsigned.Size_t.of_int size_in_bytes

(** Copies from the device pointer [src] to the device pointer [dst] through
    [Cuda.cu_memcpy_D_to_D_async], passing [stream] as the last argument of the call; the status is
    handed to [check "cu_memcpy_D_to_D_async"].
    Provide either both [kind] and [length], or just [size_in_bytes]. *)
let memcpy_D_to_D_async ?kind ?length ?size_in_bytes ~dst:(Deviceptr dst) ~src:(Deviceptr src) stream =
let byte_size =
match (size_in_bytes, kind, length) with
| Some size, None, None -> size
| None, Some kind, Some length ->
(* Byte size of one element of [kind], multiplied by the element count. *)
let c_typ = Ctypes.typ_of_bigarray_kind kind in
let elem_bytes = Ctypes.sizeof c_typ in
elem_bytes * length
| Some _, Some _, Some _ ->
invalid_arg
"memcpy_D_to_D_async: Too many arguments, provide either both [kind] and [length], or just \
[byte_size]."
| _ ->
invalid_arg
"memcpy_D_to_D_async: Too few arguments, provide either both [kind] and [length], or just \
[byte_size]."
in
(* The size resolution is delegated to [get_size_in_bytes], tagged with this function's name. *)
let size_in_bytes = get_size_in_bytes ?kind ?length ?size_in_bytes "memcpy_D_to_D_async" in
check "cu_memcpy_D_to_D_async"
@@ Cuda.cu_memcpy_D_to_D_async dst src (Unsigned.Size_t.of_int byte_size) stream
@@ Cuda.cu_memcpy_D_to_D_async dst src (Unsigned.Size_t.of_int size_in_bytes) stream

(** Copies from [src], an address in context [src_ctx], to [dst], an address in context [dst_ctx],
    through [Cuda.cu_memcpy_peer]. The byte count is resolved by [get_size_in_bytes]: provide either
    both [kind] and [length], or just [size_in_bytes]. *)
let memcpy_peer ?kind ?length ?size_in_bytes ~dst:(Deviceptr dst) ~dst_ctx ~src:(Deviceptr src) ~src_ctx () =
  let n_bytes =
    get_size_in_bytes ?kind ?length ?size_in_bytes "memcpy_peer" |> Unsigned.Size_t.of_int
  in
  Cuda.cu_memcpy_peer dst dst_ctx src src_ctx n_bytes |> check "cu_memcpy_peer"

(** Copies from [src], an address in context [src_ctx], to [dst], an address in context [dst_ctx],
    through [Cuda.cu_memcpy_peer_async], with [stream] passed as the final argument of the call.
    The byte count is resolved by [get_size_in_bytes]: provide either both [kind] and [length], or
    just [size_in_bytes]. *)
let memcpy_peer_async ?kind ?length ?size_in_bytes ~dst:(Deviceptr dst) ~dst_ctx ~src:(Deviceptr src) ~src_ctx
    stream =
  let n_bytes =
    get_size_in_bytes ?kind ?length ?size_in_bytes "memcpy_peer_async" |> Unsigned.Size_t.of_int
  in
  Cuda.cu_memcpy_peer_async dst dst_ctx src src_ctx n_bytes stream |> check "cu_memcpy_peer_async"

(** Turns off peer access between the current context and the context [ctx], by calling
    [Cuda.cu_ctx_disable_peer_access] and passing its status to [check]. *)
let ctx_disable_peer_access ctx =
  Cuda.cu_ctx_disable_peer_access ctx |> check "cu_ctx_disable_peer_access"
Expand Down Expand Up @@ -502,9 +502,9 @@ let memset_d32 (Deviceptr dev) v ~length =
(** Looks up the global called [name] in [module_] through [Cuda.cu_module_get_global]. Returns a
    pair: the address written by the call, wrapped in [Deviceptr], and the [size_t] value written by
    the call. The status of the call is handed to [check "cu_module_get_global"]. *)
let module_get_global module_ ~name =
let open Ctypes in
(* Two out-parameters for the call: a one-element address buffer and a zero-initialised size. *)
let device = allocate_n cu_deviceptr ~count:1 in
let byte_size = allocate size_t Unsigned.Size_t.zero in
check "cu_module_get_global" @@ Cuda.cu_module_get_global device byte_size module_ name;
(Deviceptr !@device, !@byte_size)
let size_in_bytes = allocate size_t Unsigned.Size_t.zero in
check "cu_module_get_global" @@ Cuda.cu_module_get_global device size_in_bytes module_ name;
(Deviceptr !@device, !@size_in_bytes)

type device_attributes = {
name : string;
Expand Down

0 comments on commit 5a41279

Please sign in to comment.