From 6d41b754440e0a656aa8f2acebb1054c2db7ea5c Mon Sep 17 00:00:00 2001
From: Lukasz Stafiniak
Date: Mon, 30 Dec 2024 21:28:04 +0100
Subject: [PATCH] Automated from_host transfers

---
 CHANGES.md               |  1 +
 arrayjit/lib/backends.ml |  7 ++++++-
 arrayjit/lib/tnode.ml    |  6 +++++-
 bin/compilation_speed.ml |  1 -
 bin/hello_world.ml       |  2 --
 bin/micrograd_demo.ml    |  3 ---
 bin/moons_demo.ml        |  3 ---
 bin/zero2hero_1of7.ml    |  7 -------
 lib/train.ml             | 14 --------------
 test/micrograd_demo.ml   | 11 -----------
 test/zero2hero_1of7.ml   |  5 -----
 11 files changed, 12 insertions(+), 48 deletions(-)

diff --git a/CHANGES.md b/CHANGES.md
index 01490129..b6e4460d 100644
--- a/CHANGES.md
+++ b/CHANGES.md
@@ -3,6 +3,7 @@
 ## Added
 
 - Automatic transfers to host from the context that most recently updated a node.
+- Automatic transfers of a routine's inputs from host to the routine's context if the host array modification was not yet transferred.
 
 ## Fixed
 
diff --git a/arrayjit/lib/backends.ml b/arrayjit/lib/backends.ml
index b90e84f5..4ae8fcc2 100644
--- a/arrayjit/lib/backends.ml
+++ b/arrayjit/lib/backends.ml
@@ -70,7 +70,9 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
           Tn.prepare_read
             ~is_done:(fun () -> Backend.is_done e)
             ~sync:(fun () -> Backend.sync e)
-            ~transfer:(fun () -> assert (to_host ctx tn); Backend.await s)
+            ~transfer:(fun () ->
+              assert (to_host ctx tn);
+              Backend.await s)
             tn);
       (* To be on the safe side, record events for potentially cross-stream nodes. *)
       match tn with
@@ -92,6 +94,7 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
       (* Stdio.printf "copying: %s from_host\n" (Tn.debug_name tn); *)
       Backend.from_host ~dst_ptr:dst ~dst:ctx hosted;
       update_writer_event ~from:`Host ctx @@ Node tn;
+      tn.host_modified <- false;
       true
  | _ -> false
 
@@ -140,6 +143,8 @@ module Add_buffer_retrieval_and_syncing (Backend : No_buffer_retrieval_or_syncin
     let s = r.context.stream in
     let hosted_inputs = Set.filter r.inputs ~f:(fun tn -> Tn.is_hosted_force tn 47) in
     let pre () =
+      assert (Domain.is_main_domain ());
+      Set.iter hosted_inputs ~f:(fun tn -> if tn.host_modified then assert (from_host r.context tn));
       Set.iter r.inputs ~f:(fun tn ->
           if Tn.potentially_cross_stream tn then
             Option.iter (Hashtbl.find s.device.shared_writer_streams tn) ~f:(fun data ->
diff --git a/arrayjit/lib/tnode.ml b/arrayjit/lib/tnode.ml
index 9d0e6355..3beb022b 100644
--- a/arrayjit/lib/tnode.ml
+++ b/arrayjit/lib/tnode.ml
@@ -84,6 +84,7 @@ type t = {
   mutable code_name : string option;
   mutable prepare_read : prepare option;
   mutable prepare_write : prepare option;
+  mutable host_modified : bool;
 }
 [@@deriving sexp_of]
 
@@ -553,6 +554,7 @@ let create ?default_prec ~id ~label ~dims init_op =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
+      host_modified = true;
     }
   in
   (* Note: if tensor nodes get non-trivial finalizers, remember to either add an is_finalized flag
@@ -576,6 +578,7 @@ let find =
       code_name = None;
       prepare_read = None;
       prepare_write = None;
+      host_modified = false;
     }
   in
   fun ~id -> Registry.find_opt registry { mock with id }
@@ -592,7 +595,8 @@ let do_read tn =
 
 let do_write tn =
   Option.iter ~f:(fun p -> p.sync ()) tn.prepare_write;
-  tn.prepare_write <- None
+  tn.prepare_write <- None;
+  tn.host_modified <- true
 
 let points_1d ?from_axis ~xdim tn =
   do_read tn;
diff --git a/bin/compilation_speed.ml b/bin/compilation_speed.ml
index 7f5c8f4a..912f0d26 100644
--- a/bin/compilation_speed.ml
+++ b/bin/compilation_speed.ml
@@ -39,7 +39,6 @@ let benchmark_overhead backend () =
Train.to_routine (module Backend) init_assign_x.context IDX.empty update_f.fwd_bprop in Tensor.print_tree ~with_grad:true ~with_backend_info:true ~depth:9 f; - Tensor.iter_embedded f ~f:(fun a -> ignore (Backend.from_host f_routine.context a : bool)); let xs = Array.init n_data ~f:Float.(fun i -> of_int i - (of_int n_data /. 2.)) in let open Operation.At in diff --git a/bin/hello_world.ml b/bin/hello_world.ml index 6639c049..99c054be 100644 --- a/bin/hello_world.ml +++ b/bin/hello_world.ml @@ -56,8 +56,6 @@ let hello3 () = let y = TDSL.O.(( + ) ~label:[ "y" ] (hey * zero_to_twenty) zero_to_twenty) in Train.set_hosted hey.value; let routine = Train.to_routine (module Backend) ctx IDX.empty @@ Train.forward y in - assert (Backend.from_host routine.context hey.value); - assert (Backend.from_host routine.context zero_to_twenty.value); Tensor.print ~with_code:true ~with_grad:false `Inline zero_to_twenty; Tensor.print ~with_code:true ~with_grad:false `Default zero_to_twenty; Tensor.print_tree ~with_grad:false ~depth:9 zero_to_twenty; diff --git a/bin/micrograd_demo.ml b/bin/micrograd_demo.ml index f3f38d59..8995b623 100644 --- a/bin/micrograd_demo.ml +++ b/bin/micrograd_demo.ml @@ -85,8 +85,6 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () = let routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ]) in - Train.all_host_to_device (module Backend) routine.context scalar_loss; - Train.all_host_to_device (module Backend) routine.context learning_rate; (* Stdio.print_endline "\n******** scalar_loss **********"; Tensor.print_tree ~with_id:true ~with_grad:false ~depth:9 scalar_loss; Stdio.print_endline "\n******** learning_rate **********"; Tensor.print_tree ~with_id:true ~with_grad:false ~depth:9 learning_rate; @@ -136,7 +134,6 @@ let experiment seed ~no_batch_shape_inference ~use_builtin_weight_decay () = Tn.set_values point.value [| x; y |]; (* For the gccjit backend, point is only on host, not on device. For cuda, this will be needed. *) - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) in diff --git a/bin/moons_demo.ml b/bin/moons_demo.ml index bac2d7e3..f0588b3b 100644 --- a/bin/moons_demo.ml +++ b/bin/moons_demo.ml @@ -78,8 +78,6 @@ let demo () = PrintBox_text.output Stdio.stdout plot_moons; Stdio.print_endline "\n"; - Train.all_host_to_device (module Backend) routine.context scalar_loss; - Train.all_host_to_device (module Backend) routine.context learning_rate; let open Operation.At in let step_ref = IDX.find_exn routine.bindings step_n in let batch_ref = IDX.find_exn routine.bindings batch_n in @@ -112,7 +110,6 @@ let demo () = let callback (x, y) = Tn.set_values point.value [| x; y |]; Utils.capture_stdout_logs @@ fun () -> - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) 
in diff --git a/bin/zero2hero_1of7.ml b/bin/zero2hero_1of7.ml index 09934a0b..f6300088 100644 --- a/bin/zero2hero_1of7.ml +++ b/bin/zero2hero_1of7.ml @@ -161,7 +161,6 @@ let () = let routine = Train.to_routine (module Backend) (Backend.make_context stream) IDX.empty update.fwd_bprop in - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool)); Backend.await stream; *) @@ -176,18 +175,12 @@ let () = @@ Train.sgd_update ~learning_rate update in (* learning_rate is virtual so this will not print anything. *) - Tensor.iter_embedded learning_rate ~f:(fun a -> - ignore (Backend.from_host routine.context a : bool)); Stdio.print_endline {| Due to how the gccjit backend works, since the parameters were constant in the grad_update computation, they did not exist on the device before. Now they do. This would not be needed on the cuda backend.|}; - List.iter [ a.value; b.value; c.value; f.value ] ~f:(fun a -> - assert (Backend.from_host routine.context a)); Train.run routine; - (* Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.to_host routine.context a : bool)); - Backend.await stream; *) Stdio.print_endline {| Now we updated the params, but after the forward and backward passes: diff --git a/lib/train.ml b/lib/train.ml index 7b5e4439..a14bdc42 100644 --- a/lib/train.ml +++ b/lib/train.ml @@ -269,16 +269,6 @@ let every_non_literal_on_host = Tensor.iter_embedded ~f:(fun a -> if Tn.mode_is_unspecified a && not (Tn.known_constant a) then set_hosted a) -(* Note: this will get nicer with modular explicits. *) -let%debug2_sexp all_host_to_device (type buffer_ptr dev runner event) - (module Backend : Backend - with type buffer_ptr = buffer_ptr - and type dev = dev - and type runner = runner - and type event = event) (context : Backend.context) = - let f tn = ignore (Backend.from_host context tn : bool) in - Tensor.iter_embedded ~f - module Lazy = Utils.Lazy (** Performs one optimization step, potentially in parallel (if [grad_updates] are linked with @@ -469,8 +459,6 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init let sgd_update = to_routine (module Backend) grad_updates.(0).context bindings sgd in Tensor.log_debug_info ~from_log_level:2 inputs; Tensor.log_debug_info ~from_log_level:2 outputs; - all_host_to_device (module Backend) sgd_update.context scalar_loss; - all_host_to_device (module Backend) sgd_update.context learning_rate; let open Operation.At in let epoch_loss = ref 0. in let step_ref = IDX.find_exn sgd_update.bindings step_n in @@ -531,7 +519,6 @@ let example_train_loop ?(disable_rootness_check = false) ~seed ~batch_size ~init (* For the gccjit backend, infer is only on host, not on device. For cuda, this will be needed. 
*) Utils.capture_stdout_logs @@ fun () -> - assert (Backend.from_host routine.context infer.value); run routine; Tn.get_values model_result.value in @@ -558,7 +545,6 @@ let%track3_sexp forward_and_ctx ?(disable_rootness_check = false) (type buffer_p and type event = event) ctx ?(bindings = IDX.empty) t = let routine = Backend.(link ctx @@ compile bindings @@ forward ~disable_rootness_check t) in if not disable_rootness_check then Tensor.remove_bprop_root t; - Tensor.iter_embedded t ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Task.run routine.schedule; routine.context diff --git a/test/micrograd_demo.ml b/test/micrograd_demo.ml index 28b9878a..53ffaa42 100644 --- a/test/micrograd_demo.ml +++ b/test/micrograd_demo.ml @@ -29,7 +29,6 @@ let%expect_test "Micrograd README basic example" = List.iter ~f:(Option.iter ~f:(fun diff -> Train.set_hosted diff.Tensor.grad)) [ a.diff; b.diff ]; let update = Train.grad_update g in let step = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded g ~f:(fun a -> ignore (Backend.from_host step.context a : bool)); Train.run step; Tensor.print ~with_code:false ~with_grad:false `Default g; [%expect @@ -89,13 +88,6 @@ let%expect_test "Micrograd half-moons example" = (* Note: for as-yet unknown reason, this test can lead to different resuls on different versions of dependencies. *) let module Backend = (val Arrayjit.Backends.fresh_backend ~backend_name:"cc" ()) in - let backend = - (module Backend : Backend - with type buffer_ptr = Backend.buffer_ptr - and type dev = Backend.dev - and type runner = Backend.runner - and type event = Backend.event) - in let stream = Backend.(new_stream @@ get_device ~ordinal:0) in let ctx = Backend.make_context stream in let open Operation.At in @@ -148,8 +140,6 @@ let%expect_test "Micrograd half-moons example" = let sgd_routine = Train.to_routine (module Backend) ctx bindings (Asgns.sequence [ update.fwd_bprop; sgd ]) in - Train.all_host_to_device backend sgd_routine.context scalar_loss; - Train.all_host_to_device backend sgd_routine.context learning_rate; let step_ref = IDX.find_exn sgd_routine.bindings step_n in step_ref := 0; for _epoch = 1 to epochs do @@ -180,7 +170,6 @@ let%expect_test "Micrograd half-moons example" = Tn.set_values point.value [| x; y |]; (* For the gccjit backend, point is only on host, not on device. For cuda, this will be needed. *) - assert (Backend.from_host result_routine.context point.value); Train.run result_routine; Float.(mlp_result.@[0] >= 0.) in diff --git a/test/zero2hero_1of7.ml b/test/zero2hero_1of7.ml index 86ba9ae7..ae02fecb 100644 --- a/test/zero2hero_1of7.ml +++ b/test/zero2hero_1of7.ml @@ -53,7 +53,6 @@ let%expect_test "Graph drawing recompile" = Train.every_non_literal_on_host f; let f_upd = Train.grad_update f in let f_bprop = Train.to_routine (module Backend) ctx IDX.empty f_upd.fwd_bprop in - Tensor.iter_embedded f ~f:(fun a -> ignore (Backend.from_host f_bprop.context a : bool)); Train.run f_bprop; Tensor.print_tree ~with_grad:true ~depth:9 f; [%expect @@ -279,7 +278,6 @@ let%expect_test "Simple gradients hosted" = |}]; (* Do not update the params: all values and gradients will be at initial points, which are specified in the tensor in the brackets. 
*) - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host grad_routine.context a : bool)); Train.run grad_routine; Tensor.print_tree ~with_grad:true ~depth:9 l; [%expect @@ -410,7 +408,6 @@ let%expect_test "Simple gradients virtual" = |}]; (* Do not update the params: all values and gradients will be at initial points, which are specified in the tensor in the brackets. *) - Tensor.iter_embedded l ~f:(fun a -> ignore (Backend.from_host grad_routine.context a : bool)); Train.run grad_routine; Tensor.print_tree ~with_grad:true ~depth:9 l; [%expect @@ -497,7 +494,6 @@ let%expect_test "2D neuron hosted" = Train.every_non_literal_on_host v; let update = Train.grad_update v in let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded v ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; Tensor.print_tree ~with_grad:true ~depth:9 v; [%expect @@ -525,7 +521,6 @@ let%expect_test "2D neuron virtual" = let%op v = ("w" [ (-3, 1) ] * "x" [ 2; 0 ]) + "b" [ 6.7 ] in let update = Train.grad_update v in let routine = Train.to_routine (module Backend) ctx IDX.empty update.fwd_bprop in - Tensor.iter_embedded v ~f:(fun a -> ignore (Backend.from_host routine.context a : bool)); Train.run routine; Tensor.print_tree ~with_grad:true ~depth:9 v; [%expect
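
Reviewer note: below is a minimal, standalone OCaml sketch of the dirty-flag protocol this patch wires into Tnode and Add_buffer_retrieval_and_syncing, for readers skimming the diff. The names here (node, create, set_values, from_host, run_routine) are hypothetical stand-ins, not the library's API; in the patch itself the flag lives on Tnode.t, is set by Tnode.do_write, cleared by Backend.from_host, and checked in the routine's pre () hook.

(* Hypothetical toy model of the host_modified flag introduced by this patch. *)
type node = {
  id : int;
  mutable host : float array;    (* host-side values *)
  mutable device : float array;  (* stand-in for the device buffer *)
  mutable host_modified : bool;  (* true when the host copy is ahead of the device copy *)
}

let create ~id ~size =
  (* Fresh nodes start dirty, so the first run always uploads them (cf. Tnode.create). *)
  { id; host = Array.make size 0.; device = Array.make size 0.; host_modified = true }

let set_values n values =
  (* Any host-side write marks the node dirty (cf. Tnode.do_write). *)
  n.host <- Array.copy values;
  n.host_modified <- true

let from_host n =
  (* Copy host -> device and clear the flag (cf. Backend.from_host setting host_modified to false). *)
  n.device <- Array.copy n.host;
  n.host_modified <- false

let run_routine ~inputs body =
  (* Before running, upload only the hosted inputs whose host copy changed (cf. the pre () hook). *)
  List.iter (fun n -> if n.host_modified then from_host n) inputs;
  body ()

let () =
  let point = create ~id:1 ~size:2 in
  set_values point [| 0.5; -1.2 |];
  (* No explicit from_host call is needed before running the routine: *)
  run_routine ~inputs:[ point ] (fun () ->
      Printf.printf "node %d device sees: %g %g\n" point.id point.device.(0) point.device.(1));
  (* A second run with an unchanged host array skips the redundant transfer. *)
  run_routine ~inputs:[ point ] (fun () -> print_endline "no re-upload needed")

This is why the manual Backend.from_host / Train.all_host_to_device calls removed throughout the bin, lib, and test files above are no longer necessary.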