microsoft · kunal-vaishnavi · Aug 16, 2024 · Jul 1, 2024 · Jul 23, 2024 · Aug 12, 2024
diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py
@@ -11,6 +11,7 @@
 import shutil
 import subprocess
 import sys
+import tempfile
 from itertools import chain
 
 import onnx
@@ -113,34 +114,6 @@ def save_onnx_model(onnx_model: onnx.ModelProto, output_path: str, data_path: st
     )
 
 
-# Notes:
-# 1) Dynamo export will not work automatically until this issue is resolved: https://github.com/microsoft/onnxscript/issues/493
-#
-# 2) Dynamo export will run manually if you set the ONNX file path to the same path that you use to save the model after export.
-# In other words, the value of `temp_path` should be set as the ONNX file path. You can open the issue in your browser to find
-# the location in ONNX Script where you have to make this change.
-#
-# Once the issue is resolved, we hope to modify the code below as follows for each export.
-#
-# Before:
-# temp_dir = args.output
-# temp_path = os.path.join(temp_dir, "temp.onnx")
-# ...
-# ...
-# ...
-# del onnx_model
-# os.system(f"rm {os.path.join(temp_dir, 'model.*')} && rm {os.path.join(temp_dir, '*.weight')} && rm {temp_path}")
-#
-#
-# After:
-# temp_dir = tempfile.TemporaryDirectory()
-# temp_path = os.path.join(temp_dir.name, "temp.onnx")
-# ...
-# ...
-# ...
-# del onnx_model
-# temp_dir.cleanup()
-#
 def run_dynamo_export(
     args: argparse.Namespace, l_config: AutoConfig, llama: AutoModelForCausalLM, rank: int = 0, world_size: int = 1
 ):
@@ -149,35 +122,25 @@ def run_dynamo_export(
     config.capture_scalar_outputs = True
 
     # Dummy values for export
-    batch_size, sequence_length = 2, 8
-    device = torch.device("cpu")
-
-    # Export decoder_model.onnx
-    input_ids, attn_mask, pos_ids = get_sample_inputs(l_config, device, batch_size, sequence_length)
-    temp_dir = args.output  # tempfile.TemporaryDirectory()
-    temp_path = os.path.join(temp_dir, "temp.onnx")  # os.path.join(temp_dir.name, "temp.onnx")
-    torch.onnx.dynamo_export(
-        llama, input_ids, attn_mask, pos_ids, export_options=torch.onnx.ExportOptions(dynamic_shapes=True)
-    ).save(temp_path)
-
-    # Check decoder_model.onnx and save all external data to one file
-    onnx.checker.check_model(temp_path)
-    onnx.shape_inference.infer_shapes_path(temp_path)
+    batch_size, sequence_length, past_sequence_length = 2, 8, 0
+    device = llama.device if args.model_name == "Llama-2-70b-hf" else torch.device("cpu")
 
-    output_path = os.path.join(args.output, f"rank_{rank}_{args.model_name}_decoder_model_fp32.onnx")
-    onnx_model = onnx.load_model(temp_path, load_external_data=True)
-    save_onnx_model(onnx_model, output_path, f"rank_{rank}_{args.model_name}_decoder_model_fp32.onnx.data")
-    del onnx_model
-    os.system(
-        f"rm {os.path.join(temp_dir, 'model.*')} && rm {os.path.join(temp_dir, '*.weight')} && rm {temp_path}"
-    )  # temp_dir.cleanup()
+    temp_name = args.model_name.lower().replace("-", "").replace("_", "")
+    max_sequence_length = 16384 if "codellama" in temp_name else 4096 if "llama2" in temp_name else 2048
 
     # Export decoder_with_past_model.onnx
-    input_ids, attn_mask, pos_ids, past_kv = get_sample_with_past_kv_inputs(
-        l_config, device, batch_size, sequence_length, world_size=world_size
+    input_ids, attn_mask, pos_ids, past_kv = get_merged_sample_with_past_kv_inputs(
+        l_config,
+        device,
+        batch_size,
+        sequence_length,
+        past_sequence_length,
+        max_seq_len=max_sequence_length,
+        use_fp16=False,
+        world_size=world_size,
     )
-    temp_dir = args.output  # tempfile.TemporaryDirectory()
-    temp_path = os.path.join(temp_dir, "temp.onnx")  # os.path.join(temp_dir.name, "temp.onnx")
+    temp_dir = tempfile.TemporaryDirectory()
+    temp_path = os.path.join(temp_dir.name, "temp.onnx")
     torch.onnx.dynamo_export(
         llama, input_ids, attn_mask, pos_ids, past_kv, export_options=torch.onnx.ExportOptions(dynamic_shapes=True)
     ).save(temp_path)
@@ -190,9 +153,7 @@ def run_dynamo_export(
     onnx_model = onnx.load_model(temp_path, load_external_data=True)
     save_onnx_model(onnx_model, output_path, f"rank_{rank}_{args.model_name}_decoder_with_past_model_fp32.onnx.data")
     del onnx_model
-    os.system(
-        f"rm {os.path.join(temp_dir, 'model.*')} && rm {os.path.join(temp_dir, '*.weight')} && rm {temp_path}"
-    )  # temp_dir.cleanup()
+    temp_dir.cleanup()
 
     logger.info(f"The {args.model_name} ONNX model has been successfully created with the Dynamo exporter!")
 
@@ -869,7 +830,7 @@ def main():
 
             # Export to ONNX
             if missing_separate_exports or missing_merged_export:
-                if args.use_dynamo_export and missing_separate_exports:
+                if args.use_dynamo_export:
                     logger.warning("Please ensure you have installed PyTorch, ONNX, and ONNX Script as follows.")
                     logger.warning("Step 1 - PyTorch nightly: https://pytorch.org/get-started/locally/")
                     logger.warning("Step 2 - ONNX weekly: https://pypi.org/project/onnx-weekly/")
@@ -902,7 +863,10 @@ def main():
                 decoder_merged_model_fp32_opt_path,
             ]
 
-            # Run the optimizer script
+            if args.use_dynamo_export:
+                continue
+
+            # Run the optimizer script.
             logger.info("Optimizing models...")
             for orig_path, opt_path in zip(old_paths, new_paths):
                 if os.path.exists(orig_path):
@@ -1007,6 +971,9 @@ def main():
                         remove_existing_model(fp_path)
         barrier()
 
+    if args.use_dynamo_export:
+        return
+
     logger.info("Verifying parity on all ONNX models created")
 
     # Use FP32 precision for FP32, INT8, INT4 CPU models, use FP16 precision for FP16 and INT4 GPU models