#3974: nanogpt uplift and move weights to weka path
1 parent 61348cf · commit 2a7cefc
Showing 11 changed files with 338 additions and 99 deletions.
@@ -0,0 +1,109 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from models.utility_functions import tt2torch_tensor
import torch
import tt_lib
from transformers import GPT2LMHeadModel
from tt_lib.utils import pad_weight
from pathlib import Path
import os


def unpad_from_zero(x, desired_shape):
    if x.shape()[-1] == desired_shape[-1] and x.shape()[-2] == desired_shape[-2]:
        x = tt2torch_tensor(x)
    else:
        x = x.cpu()
        if x.layout() != tt_lib.tensor.Layout.ROW_MAJOR:
            x = x.to(tt_lib.tensor.Layout.ROW_MAJOR)
        x = x.unpad(
            (0, 0, 0, 0), (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1)
        )
        x = x.to_torch().to(torch.float)
    return x


def cache_weights_in_weka(device, dtype, reset_seeds):
    model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
    state_dict = model_hf.state_dict()
    weights_dtype = dtype

    # initial weights are stored in "models/experimental/nanogpt/weights/" and moved to weka path
    file_name = "models/experimental/nanogpt/weights/"
    for key, value in state_dict.items():
        if key.startswith("transformer.wte.") or key.startswith("transformer.wpe."):
            torch.save(value, file_name + str(key) + ".pt")
            continue
        elif len(value.shape) == 0:
            continue
        while len(value.shape) < 4:
            value = value.unsqueeze(0)
        if value.shape[-2] % 32 == 0 and value.shape[-1] % 32 == 0:
            value = tt_lib.tensor.Tensor(
                value.reshape(-1).tolist(),
                value.shape,
                weights_dtype,
                tt_lib.tensor.Layout.ROW_MAJOR,
            ).to(tt_lib.tensor.Layout.TILE)
        else:
            value = pad_weight(value)
            value = tt_lib.tensor.Tensor(
                value.reshape(-1).tolist(),
                value.shape,
                weights_dtype,
                tt_lib.tensor.Layout.ROW_MAJOR,
            ).to(tt_lib.tensor.Layout.TILE)
        tt_lib.tensor.dump_tensor(file_name + str(key) + str(weights_dtype) + ".bin", value)


"""This function will load weights from the state_dict and check if the needed weights are available in the given path.
If they are not available, it will convert torch tensor weights to TT tensor weights and store them in the given path."""


def store_weights(model_version, file_name, base_address, dtype):
    model_hf = GPT2LMHeadModel.from_pretrained(model_version)
    state_dict = model_hf.state_dict()
    weights_dtype = dtype

    for key, value in state_dict.items():
        if base_address == "" and (
            (key.startswith("transformer.wte.") and os.path.exists(file_name + str(key) + ".pt") == False)
            or (key.startswith("transformer.wpe.") and os.path.exists(file_name + str(key) + ".pt") == False)
        ):
            torch.save(value, file_name + str(key) + ".pt")
            continue
        if key.startswith("transformer.wte.") or key.startswith("transformer.wpe.") or (len(value.shape) == 0):
            continue
        if (os.path.exists(file_name + str(key) + str(weights_dtype) + ".bin")) or (
            key.startswith(base_address) == False and base_address != ""
        ):
            continue
        while len(value.shape) < 4:
            value = value.unsqueeze(0)
        if value.shape[-2] % 32 == 0 and value.shape[-1] % 32 == 0:
            value = tt_lib.tensor.Tensor(
                value.reshape(-1).tolist(),
                value.shape,
                weights_dtype,
                tt_lib.tensor.Layout.ROW_MAJOR,
            ).to(tt_lib.tensor.Layout.TILE)
        else:
            value = pad_weight(value)
            value = tt_lib.tensor.Tensor(
                value.reshape(-1).tolist(),
                value.shape,
                weights_dtype,
                tt_lib.tensor.Layout.ROW_MAJOR,
            ).to(tt_lib.tensor.Layout.TILE)
        tt_lib.tensor.dump_tensor(file_name + str(key) + str(weights_dtype) + ".bin", value)


def get_tt_cache_path(model_version):
    tt_cache_path = Path("/mnt/MLPerf/tt_dnn-models/tt/NanoGPT") / model_version
    if tt_cache_path.exists():
        return str(tt_cache_path) + "/"
    else:
        Path(f"models/experimental/nanogpt/datasets/{model_version}").mkdir(parents=True, exist_ok=True)
        return str(Path(f"models/experimental/nanogpt/datasets/{model_version}")) + "/"
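A minimal usage sketch for the caching helpers above, not part of the diff: the module path, dtype, and base_address values below are assumptions for illustration only.

# Hypothetical usage of get_tt_cache_path / store_weights; module path and values are assumed.
import tt_lib
from models.experimental.nanogpt.nanogpt_utils import get_tt_cache_path, store_weights  # assumed module path

model_version = "gpt2"
# Returns the weka cache path when /mnt/MLPerf is mounted, otherwise creates and returns a local fallback directory.
cache_path = get_tt_cache_path(model_version)
dtype = tt_lib.tensor.DataType.BFLOAT16  # assumed TT weights dtype
# A non-empty base_address restricts conversion to state_dict keys under that prefix;
# with base_address="" the wte/wpe embedding tables are also saved as .pt files.
store_weights(model_version, cache_path, base_address="transformer.h.0.", dtype=dtype)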
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from transformers import GPT2LMHeadModel
from models.experimental.nanogpt.tt.nanogpt_model import TtGPT


def _nanogpt(config, device, tt_cache_path, dtype):
    return TtGPT(
        config=config,
        device=device,
        tt_cache_path=tt_cache_path,
        dtype=dtype,
    )


def nanogpt_model(device, dtype) -> TtGPT:
    model_name = "gpt2"
    model = GPT2LMHeadModel.from_pretrained(model_name)
    config = model.config
    tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"
    model = _nanogpt(config, device, tt_cache_path, dtype)
    return model
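A brief usage sketch for nanogpt_model, illustrative only: the device-management calls and dtype are assumptions based on the tt_lib API of the time, not part of this commit.

# Hypothetical end-to-end usage; CreateDevice/CloseDevice and the dtype are assumed.
import tt_lib

device = tt_lib.device.CreateDevice(0)  # assumed device-creation call
try:
    dtype = tt_lib.tensor.DataType.BFLOAT16  # assumed weights dtype
    tt_model = nanogpt_model(device, dtype)  # builds TtGPT with weights resolved from the weka cache path
finally:
    tt_lib.device.CloseDevice(device)  # assumed teardown call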