
Commit

Update shard_irpa_file function
Signed-off-by: aviator19941 <[email protected]>
aviator19941 committed Oct 31, 2024
1 parent d74395c commit 1ac1e0d
Showing 2 changed files with 30 additions and 21 deletions.
20 changes: 12 additions & 8 deletions sharktank/sharktank/utils/export_artifacts.py
@@ -123,29 +123,33 @@ def wrapper(*args, **kwargs):
     def shard_irpa_file(
         self,
         *,
-        output_file: str,
+        gguf_file: str,
+        output_irpa: str,
     ):
         shard_irpa_args = [
             "python3",
             "-m",
-            "sharktank.models.llama.tools.shard_llama",
-            "--irpa-file",
-            self.irpa_path,
-            "--output-file",
-            output_file,
-            "--shard_count",
+            "sharktank.examples.sharding.shard_llm_dataset",
+            "--gguf-file",
+            gguf_file,
+            "--output-irpa",
+            output_irpa,
+            "--tensor-parallelism-size",
             str(self.tensor_parallelism_size),
         ]
 
         cwd = self.sharktank_dir
         cmd = subprocess.list2cmdline(shard_irpa_args)
+        import pdb
+
+        pdb.set_trace()
 
         logger.info(f"Sharding irpa file:\n" f"cd {cwd} && {cmd}")
 
         proc = subprocess.run(cmd, shell=True, capture_output=True, cwd=cwd, text=True)
         if proc.returncode != 0:
             logger.error(
-                f"Error sharding irpa file with shard_llama.py\n"
+                f"Error sharding irpa file with shard_llm_dataset.py\n"
                 f"{proc.stdout+proc.stderr}"
             )
         else:
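For orientation, once subprocess.list2cmdline joins the argument list above, the helper logs and runs a command of roughly this shape. This is a sketch only: the working directory and output path are illustrative placeholders, while the module name, flags, 405B gguf path, and tensor-parallelism size of 8 come from the diffs in this commit.

    cd <sharktank_dir> && python3 -m sharktank.examples.sharding.shard_llm_dataset \
        --gguf-file /data/llama-3.1/405b/llama3_405b_full_combined.gguf \
        --output-irpa /tmp/llama405b_f16_tp8_sharded.irpa \
        --tensor-parallelism-size 8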
31 changes: 18 additions & 13 deletions sharktank/tests/models/llama/benchmark_amdgpu_test.py
@@ -57,6 +57,7 @@ def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
         self.artifacts_dir = Path("/data/llama-3.1/8b")
+        self.gguf_path = self.artifacts_dir / "llama8b_f16.gguf"
         self.irpa_path = self.artifacts_dir / "llama8b_f16.irpa"
         self.irpa_path_fp8 = self.artifacts_dir / "llama8b_fp8.irpa"
         self.tensor_parallelism_size = 1
@@ -151,7 +152,7 @@ def testBenchmark8B_f16_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama8b_f16_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -201,7 +202,7 @@ def testBenchmark8B_f16_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama8b_f16_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -252,7 +253,7 @@ def testBenchmark8B_fp8_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama8b_fp8_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -303,7 +304,7 @@ def testBenchmark8B_fp8_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama8b_fp8_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -342,6 +343,7 @@ def setUp(self):
         super().setUp()
         # TODO: add numpy files to Azure and download from it
         self.artifacts_dir = Path("/data/llama-3.1/70b")
+        self.gguf_path = self.artifacts_dir / "llama70b_f16.gguf"
         self.irpa_path = self.artifacts_dir / "llama70b_f16.irpa"
         self.irpa_path_fp8 = self.artifacts_dir / "llama70b_fp8.irpa"
         self.tensor_parallelism_size = 1
@@ -439,7 +441,7 @@ def testBenchmark70B_f16_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama70b_f16_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -471,7 +473,9 @@ def testBenchmark70B_f16_Decomposed(self):
         )
 
     @pytest.mark.xfail(
-        reason="Test not yet implemented", strict=True, raises=ExportMlirException
+        reason="'tm_tensor.attention' op query and mask batch dimension mismatch",
+        strict=True,
+        raises=IreeCompileException,
     )
     def testBenchmark70B_f16_Decodeposed(self):
         output_file_name = self.dir_path_70b / "f16_torch"
@@ -490,7 +494,7 @@ def testBenchmark70B_f16_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama70b_f16_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -541,7 +545,7 @@ def testBenchmark70B_fp8_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama70b_fp8_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -592,7 +596,7 @@ def testBenchmark70B_fp8_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama70b_fp8_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -632,6 +636,7 @@ def setUp(self):
         # TODO: add numpy files to Azure and download from it
         self.artifacts_dir = Path("/data/llama-3.1/405b")
         self.irpa_path = self.artifacts_dir / "llama405b_f16.irpa"
+        self.gguf_path = self.artifacts_dir / "llama3_405b_full_combined.gguf"
         self.irpa_path_fp8 = self.artifacts_dir / "llama405b_fp8.irpa"
         self.tensor_parallelism_size = 8
         self.dir_path_405b = self.dir_path / "llama-405b"
@@ -728,7 +733,7 @@ def testBenchmark405B_f16_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama405b_f16_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -779,7 +784,7 @@ def testBenchmark405B_f16_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama405b_f16_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -830,7 +835,7 @@ def testBenchmark405B_fp8_Decomposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama405b_fp8_decomposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
        )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
@@ -881,7 +886,7 @@ def testBenchmark405B_fp8_Decodeposed(self):
         )
         # shard_irpa file
         shard_return_code = self.llama405b_fp8_decodeposed_artifacts.shard_irpa_file(
-            output_file=output_shard_file_name
+            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
         )
         if shard_return_code == 0:
             self.irpa_path = output_shard_file_name
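Taken together, every benchmark test in this file now follows the same shard-then-reuse pattern shown in the hunks above. A condensed sketch of that flow follows; the output_shard_file_name value is hypothetical, since its definition lies outside the diff context shown here.

        # Shard the gguf weights into a tensor-parallel irpa file before benchmarking.
        output_shard_file_name = self.artifacts_dir / "llama8b_f16_tp1_sharded.irpa"  # hypothetical path, not from the commit
        shard_return_code = self.llama8b_f16_decomposed_artifacts.shard_irpa_file(
            gguf_file=self.gguf_path, output_irpa=output_shard_file_name
        )
        if shard_return_code == 0:
            # Reuse the sharded parameters for the subsequent export, compile, and benchmark steps.
            self.irpa_path = output_shard_file_name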

