align export model code between ET and AOTI #74

Closed · wants to merge 5 commits
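This PR aligns the ExecuTorch (ET) and AOT Inductor (AOTI) export paths behind a single interface: export.py now builds the example input once in main() and hands it to both backends, instead of each exporter constructing its own input (previously with different shapes and dtypes). model_wrapper gains a max_seq_length parameter, and the CI workflows converge on a shared MODEL_OUT output directory, with the macos-12 runner split out into its own workflow. Below is a minimal sketch of the resulting call flow, reconstructed from the hunks that follow; export_main is a hypothetical condensation of main(), and checkpoint loading plus quantization setup are elided because the diff does not touch them.

```python
import torch

# Sketch of the aligned export flow after this PR (names taken from the diff;
# export_main is a hypothetical stand-in for export.py's main()).
def export_main(model, device, args):
    model = model_wrapper(model, device=device)  # sets up KV caches (default max_seq_length=350)

    # One example input, built once and shared by both export backends.
    input = (
        torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
        torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
    )

    if args.output_pte_path:  # ExecuTorch
        export_model_et(model, input, device, args.output_pte_path, args)
    if args.output_dso_path:  # AOT Inductor
        export_model_aoti(model, input, device, args.output_dso_path, args)
```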
61 changes: 61 additions & 0 deletions .github/workflows/compile-macos12.yml
@@ -0,0 +1,61 @@
+name: Compile main
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+  workflow_dispatch:
+
+jobs:
+  run-tinystories:
+    strategy:
+      matrix:
+        runner: [macos-12]
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v2
+      - name: Setup Python
+        uses: actions/setup-python@v2
+        with:
+          python-version: 3.11
+      - name: Print machine info
+        run: |
+          uname -a
+          if [ $(uname -s) == Darwin ]; then
+            sysctl machdep.cpu.brand_string
+            sysctl machdep.cpu.core_count
+          fi
+      - name: Install requirements
+        run: |
+          pip install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cpu
+          pip install -r requirements.txt
+      - name: Download checkpoints
+        run: |
+          mkdir -p checkpoints/stories15M
+          pushd checkpoints/stories15M
+          wget https://huggingface.co/karpathy/tinyllamas/resolve/main/stories15M.pt
+          wget https://github.com/karpathy/llama2.c/raw/master/tokenizer.model
+          popd
+      - name: Run inference
+        run: |
+          export MODEL_PATH=checkpoints/stories15M/stories15M.pt
+          export MODEL_NAME=stories15M
+          export MODEL_OUT=/tmp
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
+          cat ./output_eager
+          python generate.py --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
+          cat ./output_compiled
+          python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
+          cat ./output_aoti
+          echo "tests complete"
+          echo "******************************************"
+          # echo "********* EAGER vs TORCH.COMPILE *********"
+          # echo "******************************************"
+          # diff output_eager output_compiled
+          # echo "******************************************"
+          # echo "********* EAGER vs AOT INDUCTOR *********"
+          # echo "******************************************"
+          # diff output_eager output_aoti
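This new workflow duplicates the compile.yml job for the macos-12 runner only; the next file removes macos-12 from the shared matrix, so the net effect is to give that runner its own job. Note that the eager-vs-compiled and eager-vs-AOTI diff checks are kept but commented out, so on macos-12 the job currently only verifies that each path runs to completion.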
24 changes: 12 additions & 12 deletions .github/workflows/compile.yml
@@ -11,7 +11,7 @@ jobs:
   run-tinystories:
     strategy:
       matrix:
-        runner: [ubuntu-latest, macos-12, macos-14]
+        runner: [ubuntu-latest, macos-14]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout repo
@@ -42,13 +42,13 @@ jobs:
         run: |
           export MODEL_PATH=checkpoints/stories15M/stories15M.pt
           export MODEL_NAME=stories15M
-          export MODEL_DIR=/tmp
+          export MODEL_OUT=/tmp
           python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_eager
           cat ./output_eager
           python generate.py --compile --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          python export.py --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
           cat ./output_aoti
 
           echo "******************************************"
@@ -58,8 +58,8 @@ jobs:
           cat ./output_eager
           python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
           cat ./output_aoti
 
           echo "******************************************"
@@ -69,8 +69,8 @@ jobs:
           cat ./output_eager
           python generate.py --compile --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          python export.py --quant '{"embedding" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
           cat ./output_aoti
 
           echo "******************************************"
@@ -80,8 +80,8 @@ jobs:
           cat ./output_eager
           python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 0}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
           cat ./output_aoti
 
           echo "******************************************"
@@ -91,8 +91,8 @@ jobs:
           cat ./output_eager
           python generate.py --compile --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --temperature 0 > ./output_compiled
           cat ./output_compiled
-          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_DIR}/${MODEL_NAME}.so
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_DIR}/${MODEL_NAME}.so > ./output_aoti
+          python export.py --quant '{"linear:int8" : {"bitwidth": 8, "group_size": 8}}' --checkpoint-path ${MODEL_PATH} --output-dso-path ${MODEL_OUT}/${MODEL_NAME}.so
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --dso-path ${MODEL_OUT}/${MODEL_NAME}.so > ./output_aoti
           cat ./output_aoti
 
           echo "tests complete"
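Besides dropping macos-12 from the matrix, the only change here is the rename of MODEL_DIR to MODEL_OUT, which brings this workflow's environment in line with compile-macos12.yml above and et.yml below.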
13 changes: 7 additions & 6 deletions .github/workflows/et.yml
@@ -64,12 +64,13 @@ jobs:
         run: |
           export MODEL_PATH=${PWD}/checkpoints/stories15M/stories15M.pt
           export MODEL_NAME=stories15M
+          export MODEL_OUT=/tmp
 
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ${PWD}/output_eager
-          cat ${PWD}/output_eager
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 > ${MODEL_OUT}/output_eager
+          cat ${MODEL_OUT}/output_eager
 
-          python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${PWD}/${MODEL_NAME}.pte
-          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${PWD}/${MODEL_NAME}.pte > ${PWD}/output_et
-          cat ${PWD}/output_et
+          python export.py --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_OUT}/${MODEL_NAME}.pte
+          python generate.py --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_OUT}/${MODEL_NAME}.pte > ${MODEL_OUT}/output_et
+          cat ${MODEL_OUT}/output_et
 
           echo "Tests complete."
19 changes: 11 additions & 8 deletions export.py
@@ -16,7 +16,7 @@
     executorch_export_available = True
     from export_et import export_model as export_model_et
 except Exception as e:
-    print("ET EXPORT EXCEPTION: ", e)  # TODO: remove
+    # print("ET EXPORT EXCEPTION: ", e)  # TODO: remove
     executorch_export_available = False
 
 from export_aoti import export_model as export_model_aoti
@@ -39,15 +39,13 @@ def device_sync(device):
 
 class model_wrapper(nn.Module):
-    def __init__(self, model, device):
+    def __init__(self, model, device, max_seq_length = 350):
         super().__init__()
 
-        max_seq_length = 350
         with torch.device(device):
             model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
 
         self.model = model
-        # init model here if necessary
 
 
     def forward(self, idx, input_pos):
         # input_pos: [B, 1]
@@ -74,7 +72,12 @@ def main(checkpoint_path, device, quantize = "{ }", args = None):
 
     quantize_model(model, args.quantize)
     model = model_wrapper(model, device=device)
 
+    input = (
+        torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
+        torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
+    )
+
     output_pte_path = args.output_pte_path
     output_dso_path = args.output_dso_path
 
@@ -84,13 +87,13 @@ def main(checkpoint_path, device, quantize = "{ }", args = None):
         print(f">{output_pte_path}<")
         if executorch_export_available:
             print(f"Exporting model using Executorch to {output_pte_path}")
-            export_model_et(model, device, args.output_pte_path, args)
+            export_model_et(model, input, device, args.output_pte_path, args)
         else:
             print("Export with executorch requested but Executorch could not be loaded")
     if output_dso_path:
         output_dso_path = str(os.path.abspath(output_dso_path))
         print(f"Exporting model using AOT Inductor to {output_dso_path}")
-        export_model_aoti(model, device, output_dso_path, args)
+        export_model_aoti(model, input, device, output_dso_path, args)
 
 
 def cli():
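Since max_seq_length is now a constructor argument rather than a hard-coded local, callers can size the KV caches at wrap time. A small usage sketch follows; the default of 350 matches the diff, while the 1024 override is hypothetical.

```python
import torch

# Default: caches sized for 350 positions, same behavior as before this PR.
wrapped = model_wrapper(model, device=device)

# Hypothetical override for a longer context window:
wrapped_long = model_wrapper(model, device=device, max_seq_length=1024)

# The forward signature is unchanged: token ids plus their positions.
logits = wrapped(
    torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
    torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
)
```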
6 changes: 1 addition & 5 deletions export_aoti.py
@@ -31,15 +31,11 @@ def device_sync(device):
         print(f"device={device} is not yet supported")
 
 
-def export_model(model: nn.Module, device, output_path, args=None):
+def export_model(model: nn.Module, input, device, output_path, args=None):
     max_seq_length = 350
     # with torch.device(device):
     #     model.setup_caches(max_batch_size=1, max_seq_length=max_seq_length)
 
-    input = (
-        torch.tensor([[1, 9038, 2501, 263, 931]], dtype=torch.int, device=device),
-        torch.tensor([0, 1, 2, 3, 4], dtype=torch.int, device=device),
-    )
 
     print(f"len(input)={len(input)}")
 
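With the example input now supplied by the caller, the local max_seq_length = 350 left in export_model (and the commented-out setup_caches call above it) appears to have no remaining effect in the visible hunk; cache setup happens in model_wrapper before export_model is called.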
10 changes: 5 additions & 5 deletions export_et.py
@@ -74,17 +74,17 @@ def canonical_path(path):
     return path
 
 
-def export_model(model, device, output_path, args=None) -> str:  # noqa: C901
+def export_model(model, input, device, output_path, args=None) -> str:  # noqa: C901
 
     # applied wrapper already in export.
     # export_model = model_wrapper(model, device=device)
     export_model = model
     print(export_model)
 
-    input = (
-        torch.tensor([[1]], dtype=torch.long, device=device),
-        torch.tensor([0], dtype=torch.long, device=device),
-    )
+    #input = (
+    #    torch.tensor([[1]], dtype=torch.long, device=device),
+    #    torch.tensor([0], dtype=torch.long, device=device),
+    #)
 
     state_dict = model.state_dict()
     state_dict_dtype = state_dict[next(iter(state_dict))].dtype
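Previously the ET path traced with its own single-token torch.long example; it now receives the shared five-token torch.int input built in export.py, so both backends export against the same example. The old input construction is kept here as a comment rather than deleted.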
4 changes: 2 additions & 2 deletions model_et.py
@@ -11,9 +11,9 @@ def __init__(self, config, path) -> None:
         self.config = config
         self.model_ = exec_lib._load_for_executorch(str(path))
 
-    def forward(self, x, input_pos):
+    def forward(self, idx, input_pos):
         # model_.forward expects inputs to be wrapped in a tuple
-        forward_inputs = (x.to(torch.long), input_pos.to(torch.long))
+        forward_inputs = (idx.to(torch.long), input_pos.to(torch.long))
         logits = self.model_.forward(forward_inputs)
 
         # After wrapping in a tuple, we get a list back, so we need to grab
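The x to idx rename makes the ExecuTorch runner's forward match model_wrapper.forward(idx, input_pos) in export.py, so the eager, AOTI, and ET paths now share the same argument names end to end.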