From 6d41b754440e0a656aa8f2acebb1054c2db7ea5c Mon Sep 17 00:00:00 2001
From: Lukasz Stafiniak
Date: Mon, 30 Dec 2024 21:28:04 +0100
Subject: [PATCH] Automated from_host transfers

---
 CHANGES.md               |  1 +
 arrayjit/lib/backends.ml |  7 ++++++-
 arrayjit/lib/tnode.ml    |  6 +++++-
 bin/compilation_speed.ml |  1 -
 bin/hello_world.ml       |  2 --
 bin/micrograd_demo.ml    |  3 ---
 bin/moons_demo.ml        |  3 ---
 bin/zero2hero_1of7.ml    |  7 -------
 lib/train.ml             | 14 --------------
 test/micrograd_demo.ml   | 11 -----------
 test/zero2hero_1of7.ml   |  5 -----
 11 files changed, 12 insertions(+), 48 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 01490129..b6e4460d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,7 @@
 ## Added
 
 - Automatic transfers to host from the context that most recently updated a node.
+- Automatic transfers of a routine's inputs from host to the routine's context if the host array modification was not yet transferred.
 
 ## Fixed
 
diff --git a/arrayjit/lib/backends.ml b/arrayjit/lib/backends.ml
index b90e84f5..4ae8fcc2 100644
--- a/arrayjit/lib/backends.ml
+++ b/arrayjit/lib/backends.ml
@@ -70,7 +70,9 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
           Tn.prepare_read
             ~is_done:(fun () -> Backend.is_done e)
             ~sync:(fun () -> Backend.sync e)
-            ~transfer:(fun () -> assert (to_host ctx tn); Backend.await s)
+            ~transfer:(fun () ->
+              assert (to_host ctx tn);
+              Backend.await s)
             tn);
       (* To be on the safe side, record events for potentially cross-stream nodes. *)
       match tn with
@@ -92,6 +94,7 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
       (* Stdio.printf "copying: %s from_host\n" (Tn.debug_name tn); *)
       Backend.from_host ~dst_ptr:dst ~dst:ctx hosted;
       update_writer_event ~from:`Host ctx @@ Node tn;
+      tn.host_modified <- false;
       true
  | _ -> false
 
@@ -140,6 +143,8 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
     let s = r.context.stream in
     let hosted_inputs = Set.filter r.inputs ~f:(fun tn -> Tn.is_hosted_force tn 47) in
     let pre () =
+      assert (Domain.is_main_domain ());
+      Set.iter hosted_inputs ~f:(fun tn -> if tn.host_modified then assert (from_host r.context tn));
       Set.iter r.inputs ~f:(fun tn ->
           if Tn.potentially_cross_stream tn then
             Option.iter (Hashtbl.find s.device.shared_writer_streams tn) ~f:(fun data ->
diff --git a/arrayjit/lib/tnode.ml b/arrayjit/lib/tnode.ml
index 9d0e6355..3beb022b 100644
--- a/arrayjit/lib/tnode.ml
+++ b/arrayjit/lib/tnode.ml
@@ -84,6 +84,7 @@ type t = {
   mutable code_name : string option;
   mutable prepare_read : prepare option;
   mutable prepare_write : prepare option;
+  mutable host_modified : bool;
 }
 [@@deriving sexp_of]
 
@@ -553,6 +554,7 @@ let create ?default_prec ~id ~label ~dims init_op =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
+      host_modified = true;
     }
   in
   (* Note: if tensor nodes get non-trivial finalizers, remember to either add an is_finalized flag
@@ -576,6 +578,7 @@ let find =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
+      host_modified = false;
     }
   in
   fun ~id -> Registry.find_opt registry { mock with id }
@@ -592,7 +595,8 @@ let do_read tn =
 
 let do_write tn =
   Option.iter ~f:(fun p -> p.sync ()) tn.prepare_write;
-  tn.prepare_write <- None
+  tn.prepare_write <- None;
+  tn.host_modified <- true
 
 let points_1d ?from_axis ~xdim tn =
   do_read tn;
diff --git a/bin/compilation_speed.ml b/bin/compilation_speed.ml
index 7f5c8f4a..912f0d26 100644
--- a/bin/compilation_speed.ml
+++ b/bin/compilation_speed.ml
@@ -39,7 +39,6 @@ let benchmark_overhead backend () =
Train.to_routine (module Backend) init_assign_x.context IDX.empty update_f.fwd_bprop in Tensor.print_tree ~with_grad:true ~with_backend_info:true ~depth:9 f; - Tensor.iter_embedded f ~f:(fun a -> ignore (Backend.from_host f_routine.context a : bool)); let xs = Array.init n_data ~f:Float.(fun i -> of_int i - (of_int n_data /. 2.)) in let open Operation.At in diff --git a/bin/hello_world.ml b/bin/hello_world.ml index 6639c049..99c054be 100644 --- a/bin/hello_world.ml +++ b/bin/hello_world.ml @@ -56,8 +56,6 @@ let hello3 () = let y = TDSL.O.(( + ) ~label:[ "y" ] (hey * zero_to_twenty) zero_to_twenty) in Train.set_hosted hey.value; let routine = Train.to_routine (module Backend) ctx IDX.empty @@ Train.forward y in - assert (Backend.from_host routine.context hey.value); - assert (Backend.from_host routine.context zero_to_twenty.value); Tensor.print ~with_code:true ~with_grad:false `Inline zero_to_twenty; Tensor.print ~with_code:true ~with_grad:false `Default zero_to_twenty; Tensor.print_tree ~with_grad:false ~depth:9 zero_to_twenty; diff --git a/bin/micrograd_demo.ml b/bin/micrograd_demo.ml index f3f38d59..8995b623 100644 --- a/bin/micrograd_demo.ml +++ b/bin/micrograd_demo.ml @@ -85,8 +85,6 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () = let routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ]) in - Train.all_host_to_device (module Backend) routine.context scalar_loss; - Train.all_host_to_device (module Backend) routine.context learning_rate; (* Stdio.print_endline "\n******** scalar_loss **********"; Tensor.print_tree ~with_id:true ~with_grad:false ~depth:9 scalar_loss; Stdio.print_endline "\n******** learning_rate **********"; Tensor.print_tree ~with_id:true ~with_grad:false ~depth:9 learning_rate; @@ -136,7 +134,6 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () = Tn.set_values point.value [| x; y |]; (* For the gccjit backend, point is only on host, not on device. For cuda, this will be needed. *) - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) in diff --git a/bin/moons_demo.ml b/bin/moons_demo.ml index bac2d7e3..f0588b3b 100644 --- a/bin/moons_demo.ml +++ b/bin/moons_demo.ml @@ -78,8 +78,6 @@ let demo () = PrintBox_text.output Stdio.stdout plot_moons; Stdio.print_endline "\n"; - Train.all_host_to_device (module Backend) routine.context scalar_loss; - Train.all_host_to_device (module Backend) routine.context learning_rate; let open Operation.At in let step_ref = IDX.find_exn routine.bindings step_n in let batch_ref = IDX.find_exn routine.bindings batch_n in @@ -112,7 +110,6 @@ let demo () = let callback (x, y) = Tn.set_values point.value [| x; y |]; Utils.capture_stdout_logs @@ fun () -> - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) 
in diff --git a/bin/zero2hero_1of7.ml b/bin/zero2hero_1of7.ml index 09934a0b..f6300088 100644 --- a/bin/zero2hero_1of7.ml +++ b/bin/zero2hero_1of7.ml @@ -161,7 +161,6 @@ let () = let routine = Train.to_routine (module Backend) (Backend.make_context stream) IDX.empty update.fwd_bprop in - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool)); Backend.await stream; *) @@ -176,18 +175,12 @@ let () = @@ Train.sgd_update ~learning_rate update in (* learning_rate is virtual so this will not print anything. *) - Tensor.iter_embedded learning_rate ~f:(fun a -> - ignore (Backend.from_host routine.context a : bool)); Stdio.print_endline {| Due to how the gccjit backend works, since the parameters were constant in the grad_update computation, they did not exist on the device before. Now they do. This would not be needed on the cuda backend.|}; - List.iter [ a.value; b.value; c.value; f.value ] ~f:(fun a -> - assert (Backend.from_host routine.context a)); Train.run routine; - (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool)); - Backend.await stream; *) Stdio.print_endline {| Now we updated the params, but after the forward and backward passes: diff --git a/lib/train.ml b/lib/train.ml index 7b5e4439..a14bdc42 100644 --- a/lib/train.ml +++ b/lib/train.ml @@ -269,16 +269,6 @@ let every_non_literal_on_host = Tensor.iter_embedded ~f:(fun a -> if Tn.mode_is_unspecified a && not (Tn.known_constant a) then set_hosted a) -(* Note: this will get nicer with modular explicits. *) -let%debug2_sexp all_host_to_device (type buffer_ptr dev runner event) - (module Backend : Backend - with type buffer_ptr = buffer_ptr - and type dev = dev - and type runner = runner - and type event = event) (context : Backend.context) = - let f tn = ignore (Backend.from_host context tn : bool) in - Tensor.iter_embedded ~f - module Lazy = Utils.Lazy (** Performs one optimization step, potentially in parallel (if [grad_updates] are linked with @@ -469,8 +459,6 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init let sgd_update = to_routine (module Backend) grad_updates.(0).context bindings sgd in Tensor.log_debug_info ~from_log_level:2 inputs; Tensor.log_debug_info ~from_log_level:2 outputs; - all_host_to_device (module Backend) sgd_update.context scalar_loss; - all_host_to_device (module Backend) sgd_update.context learning_rate; let open Operation.At in let epoch_loss = ref 0. in let step_ref = IDX.find_exn sgd_update.bindings step_n in @@ -531,7 +519,6 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init (* For the gccjit backend, infer is only on host, not on device. For cuda, this will be needed. 
*) Utils.capture_stdout_logs @@ fun () -> - assert (Backend.from_host routine.context infer.value); run routine; Tn.get_values model_result.value in @@ -558,7 +545,6 @@ let%track3_sexp forward_and_ctx ?(disable_rootness_check = false) (type buffer_p and type event = event) ctx ?(bindings = IDX.empty) t = let routine = Backend.(link ctx @@ compile bindings @@ forward ~disable_rootness_check t) in if not disable_rootness_check then Tensor.remove_bprop_root t; - Tensor.iter_embedded t ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Task.run routine.schedule; routine.context diff --git a/test/micrograd_demo.ml b/test/micrograd_demo.ml index 28b9878a..53ffaa42 100644 --- a/test/micrograd_demo.ml +++ b/test/micrograd_demo.ml @@ -29,7 +29,6 @@ let%expect_test "Micrograd README basic example" = List.iter ~f:(Option.iter ~f:(fun diff -> Train.set_hosted diff.Tensor.grad)) [ a.diff; b.diff ]; let update = Train.grad_update g in let step = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded g ~f:(fun a -> ignore (Backend.from_host step.context a : bool)); Train.run step; Tensor.print ~with_code:false ~with_grad:false `Default g; [%expect @@ -89,13 +88,6 @@ let%expect_test "Micrograd half-moons example" = (* Note: for as-yet unknown reason, this test can lead to different resuls on different versions of dependencies. *) let module Backend = (val Arrayjit.Backends.fresh_backend ~backend_name:"cc" ()) in - let backend = - (module Backend : Backend - with type buffer_ptr = Backend.buffer_ptr - and type dev = Backend.dev - and type runner = Backend.runner - and type event = Backend.event) - in let stream = Backend.(new_stream @@ get_device ~ordinal:0) in let ctx = Backend.make_context stream in let open Operation.At in @@ -148,8 +140,6 @@ let%expect_test "Micrograd half-moons example" = let sgd_routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ]) in - Train.all_host_to_device backend sgd_routine.context scalar_loss; - Train.all_host_to_device backend sgd_routine.context learning_rate; let step_ref = IDX.find_exn sgd_routine.bindings step_n in step_ref := 0; for _epoch = 1 to epochs do @@ -180,7 +170,6 @@ let%expect_test "Micrograd half-moons example" = Tn.set_values point.value [| x; y |]; (* For the gccjit backend, point is only on host, not on device. For cuda, this will be needed. *) - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) in diff --git a/test/zero2hero_1of7.ml b/test/zero2hero_1of7.ml index 86ba9ae7..ae02fecb 100644 --- a/test/zero2hero_1of7.ml +++ b/test/zero2hero_1of7.ml @@ -53,7 +53,6 @@ let%expect_test "Graph drawing recompile" = Train.every_non_literal_on_host f; let f_upd = Train.grad_update f in let f_bprop = Train.to_routine (module Backend) ctx IDX.empty f_upd.fwd_bprop in - Tensor.iter_embedded f ~f:(fun a -> ignore (Backend.from_host f_bprop.context a : bool)); Train.run f_bprop; Tensor.print_tree ~with_grad:true ~depth:9 f; [%expect @@ -279,7 +278,6 @@ let%expect_test "Simple gradients hosted" = |}]; (* Do not update the params: all values and gradients will be at initial points, which are specified in the tensor in the brackets. 
*) - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host grad_routine.context a : bool)); Train.run grad_routine; Tensor.print_tree ~with_grad:true ~depth:9 l; [%expect @@ -410,7 +408,6 @@ let%expect_test "Simple gradients virtual" = |}]; (* Do not update the params: all values and gradients will be at initial points, which are specified in the tensor in the brackets. *) - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host grad_routine.context a : bool)); Train.run grad_routine; Tensor.print_tree ~with_grad:true ~depth:9 l; [%expect @@ -497,7 +494,6 @@ let%expect_test "2D neuron hosted" = Train.every_non_literal_on_host v; let update = Train.grad_update v in let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded v ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; Tensor.print_tree ~with_grad:true ~depth:9 v; [%expect @@ -525,7 +521,6 @@ let%expect_test "2D neuron virtual" = let%op v = ("w" [ (-3, 1) ] * "x" [ 2; 0 ]) + "b" [ 6.7 ] in let update = Train.grad_update v in let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded v ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; Tensor.print_tree ~with_grad:true ~depth:9 v; [%expect
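
Reviewer note: below is a minimal, standalone OCaml sketch of the dirty-flag protocol this patch wires into Tnode and Add_buffer_retrieval_and_syncing, for readers skimming the diff. The names here (node, create, set_values, from_host, run_routine) are hypothetical stand-ins, not the library's API; in the patch itself the flag lives on Tnode.t, is set by Tnode.do_write, cleared by Backend.from_host, and checked in the routine's pre () hook.

(* Hypothetical toy model of the host_modified flag introduced by this patch. *)
type node = {
  id : int;
  mutable host : float array;    (* host-side values *)
  mutable device : float array;  (* stand-in for the device buffer *)
  mutable host_modified : bool;  (* true when the host copy is ahead of the device copy *)
}

let create ~id ~size =
  (* Fresh nodes start dirty, so the first run always uploads them (cf. Tnode.create). *)
  { id; host = Array.make size 0.; device = Array.make size 0.; host_modified = true }

let set_values n values =
  (* Any host-side write marks the node dirty (cf. Tnode.do_write). *)
  n.host <- Array.copy values;
  n.host_modified <- true

let from_host n =
  (* Copy host -> device and clear the flag (cf. Backend.from_host setting host_modified to false). *)
  n.device <- Array.copy n.host;
  n.host_modified <- false

let run_routine ~inputs body =
  (* Before running, upload only the hosted inputs whose host copy changed (cf. the pre () hook). *)
  List.iter (fun n -> if n.host_modified then from_host n) inputs;
  body ()

let () =
  let point = create ~id:1 ~size:2 in
  set_values point [| 0.5; -1.2 |];
  (* No explicit from_host call is needed before running the routine: *)
  run_routine ~inputs:[ point ] (fun () ->
      Printf.printf "node %d device sees: %g %g\n" point.id point.device.(0) point.device.(1));
  (* A second run with an unchanged host array skips the redundant transfer. *)
  run_routine ~inputs:[ point ] (fun () -> print_endline "no re-upload needed")

This is why the manual Backend.from_host / Train.all_host_to_device calls removed throughout the bin, lib, and test files above are no longer necessary.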