#3974: nanogpt uplift and move weights to weka path
punithsekar committed Dec 8, 2023
1 parent 5aeccad commit 7d782bf
Showing 11 changed files with 247 additions and 100 deletions.
60 changes: 60 additions & 0 deletions models/experimental/nanogpt/nanogpt_utils.py
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from models.utility_functions import tt2torch_tensor
import torch
import tt_lib
from transformers import GPT2LMHeadModel
from tt_lib.utils import pad_weight


def unpad_from_zero(x, desired_shape):
if x.shape()[-1] == desired_shape[-1] and x.shape()[-2] == desired_shape[-2]:
x = tt2torch_tensor(x)
else:
x = x.cpu()
if x.layout() != tt_lib.tensor.Layout.ROW_MAJOR:
x = x.to(tt_lib.tensor.Layout.ROW_MAJOR)
x = x.unpad(
(0, 0, 0, 0), (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1)
)
x = x.to_torch().to(torch.float)
return x


def cache_weights_in_weka(device, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
state_dict = model_hf.state_dict()
weights_dtype = dtype

# initial weights are stored in "models/experimental/nanogpt/weights/" and moved to weka path
file_name = "models/experimental/nanogpt/weights/"
for key, value in state_dict.items():
if key.startswith("transformer.wte.") or key.startswith("transformer.wpe."):
torch.save(value, file_name + str(key) + ".pt")
continue
elif len(value.shape) == 0:
continue
if len(value.shape) == 1:
value = value.unsqueeze(0).unsqueeze(0).unsqueeze(0)
elif len(value.shape) == 3:
value = value.unsqueeze(0)
elif len(value.shape) == 2:
value = value.unsqueeze(0).unsqueeze(0)
if value.shape[-2] % 32 == 0 and value.shape[-1] % 32 == 0:
value = tt_lib.tensor.Tensor(
value.reshape(-1).tolist(),
value.shape,
weights_dtype,
tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
else:
value = pad_weight(value)
value = tt_lib.tensor.Tensor(
value.reshape(-1).tolist(),
value.shape,
weights_dtype,
tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
tt_lib.tensor.dump_tensor(file_name + str(key) + str(weights_dtype) + ".bin", value)
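For context, a minimal sketch of the round trip this utility enables: pad a non-tile-aligned weight, tile it, dump it to disk, and load it back at construction time. The key and shape below are hypothetical; the tests use /mnt/MLPerf/tt_dnn-models/tt/NanoGPT/ as the real cache path.

import torch
import tt_lib
from tt_lib.utils import pad_weight

dtype = tt_lib.tensor.DataType.BFLOAT16
value = torch.rand(1, 1, 100, 300)  # hypothetical weight; last two dims not multiples of 32
value = pad_weight(value)  # zero-pads the last two dims up to tile boundaries
tt_value = tt_lib.tensor.Tensor(
    value.reshape(-1).tolist(),
    value.shape,
    dtype,
    tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
tt_lib.tensor.dump_tensor("example.weight" + str(dtype) + ".bin", tt_value)

# Construction-time load, as the uplifted modules below now do:
reloaded = tt_lib.tensor.load_tensor("example.weight" + str(dtype) + ".bin")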
15 changes: 9 additions & 6 deletions models/experimental/nanogpt/tests/test_nanogpt_attention.py
@@ -4,10 +4,10 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel


from loguru import logger
import models.experimental.nanogpt.tt.nanogpt_attention as nanogpt_attention

@@ -19,16 +19,17 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)

def test_nanogpt_attn(device, pcc, reset_seeds):

def test_nanogpt_attn(device, pcc, dtype, reset_seeds):
# Prepare input
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
@@ -38,8 +39,10 @@ def test_nanogpt_attn(device, pcc, reset_seeds):
pt_attn = model_hf.transformer.h[block].attn
pt_out = pt_attn.forward(test_in)

tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_attn = nanogpt_attention.TtCausalSelfAttention(config, sd, base_address, device)
tt_attn = nanogpt_attention.TtCausalSelfAttention(config, base_address, device, tt_cache_path, dtype)

tt_out = tt_attn.forward(tt_test_in)

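The pcc threshold these tests assert on is a Pearson correlation between the flattened golden and TT outputs. A minimal sketch of that check, assuming the repo's comp_pcc behaves equivalently (its exact return format may differ):

import torch

def pearson_cc(golden: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both outputs and compute the Pearson correlation coefficient,
    # which is what the 0.99 threshold above is compared against.
    g = golden.flatten().to(torch.float)
    a = actual.flatten().to(torch.float)
    return torch.corrcoef(torch.stack([g, a]))[0, 1].item()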
12 changes: 8 additions & 4 deletions models/experimental/nanogpt/tests/test_nanogpt_block.py
@@ -4,6 +4,7 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel

@@ -18,14 +19,16 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)
def test_nanogpt_block(device, pcc, reset_seeds):

def test_nanogpt_block(device, pcc, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
@@ -36,8 +39,9 @@ def test_nanogpt_block(device, pcc, reset_seeds):
pt_out = pt_block.forward(test_in)

tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_block = nanogpt_block.TtBlock(config, sd, base_address, device)
tt_block = nanogpt_block.TtBlock(config, base_address, device, tt_cache_path, dtype)
tt_block.eval()

tt_out = tt_block.forward(tt_test_in)
13 changes: 8 additions & 5 deletions models/experimental/nanogpt/tests/test_nanogpt_mlp.py
@@ -4,13 +4,13 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel

from loguru import logger
import models.experimental.nanogpt.tt.nanogpt_mlp as nanogpt_mlp


from models.utility_functions import (
tt_to_torch_tensor,
torch_to_tt_tensor_rm,
@@ -19,22 +19,25 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)
def test_nanogpt_mlp(device, pcc, reset_seeds):

def test_nanogpt_mlp(device, pcc, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
base_address = f"transformer.h.{block}.mlp"

test_in = torch.rand(1, 43, 768)
tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_mlp = nanogpt_mlp.TtMLP(base_address, config, sd, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"
tt_mlp = nanogpt_mlp.TtMLP(base_address, config, device, tt_cache_path, dtype)

tt_out = tt_mlp.forward(tt_test_in)

17 changes: 10 additions & 7 deletions models/experimental/nanogpt/tests/test_nanogpt_model_real.py
@@ -2,7 +2,7 @@

# SPDX-License-Identifier: Apache-2.0

import torch
import tt_lib
import pytest

from transformers import GPT2Tokenizer, GPT2LMHeadModel
@@ -13,17 +13,18 @@
from models.utility_functions import tt_to_torch_tensor, comp_allclose, comp_pcc



@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc, prompt",
((0.99, "Hello, my dog is a little"),),
((0.98, "Hello, my dog is a little"),),
)
def test_nanogpt_model_real(device, pcc, prompt, reset_seeds):

def test_nanogpt_model_real(device, pcc, prompt, dtype, reset_seeds):
# Prepare input
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
sd = model_hf.state_dict()
model_hf.eval()

inputs = tokenizer(prompt, return_tensors="pt", padding=False)
@@ -33,7 +34,9 @@ def test_nanogpt_model_real(device, pcc, prompt, reset_seeds):

config = model_hf.config

tt_model = nanogpt_model.TtGPT(config, sd, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_model = nanogpt_model.TtGPT(config, device, tt_cache_path, dtype)

tt_out = tt_model.forward(inputs.input_ids)

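For orientation, a sketch of the golden side of this comparison; the TT logits are then held to the 0.98 PCC floor (relaxed from 0.99 by the on-device softmax, per the comment in nanogpt_attention.py below):

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model_hf = GPT2LMHeadModel.from_pretrained("gpt2").eval()
inputs = tokenizer("Hello, my dog is a little", return_tensors="pt", padding=False)
with torch.no_grad():
    pt_logits = model_hf(inputs.input_ids).logits  # golden reference for comp_pcc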
24 changes: 24 additions & 0 deletions models/experimental/nanogpt/tt/nanogpt.py
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from transformers import GPT2LMHeadModel
from models.experimental.nanogpt.tt.nanogpt_model import TtGPT


def _nanogpt(config, device, tt_cache_path, dtype):
return TtGPT(
config=config,
device=device,
tt_cache_path=tt_cache_path,
dtype=dtype,
)


def nanogpt_model(device, dtype) -> TtGPT:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
config = model.config
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"
model = _nanogpt(config, device, tt_cache_path, dtype)
return model
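A usage sketch for this factory; the device handle is assumed to come from the same device fixture the tests use:

import tt_lib
from models.experimental.nanogpt.tt.nanogpt import nanogpt_model

def build_nanogpt(device):
    # dtype selects which cached .bin weight variants are loaded from the weka path.
    dtype = tt_lib.tensor.DataType.BFLOAT16
    return nanogpt_model(device, dtype)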
39 changes: 23 additions & 16 deletions models/experimental/nanogpt/tt/nanogpt_attention.py
@@ -2,7 +2,6 @@

# SPDX-License-Identifier: Apache-2.0

import torch
import torch.nn as nn
import tt_lib
import math
@@ -16,7 +15,7 @@


class TtCausalSelfAttention(nn.Module):
def __init__(self, config, state_dict, base_address, device):
def __init__(self, config, base_address, device, tt_cache_path, dtype):
super().__init__()
assert config.n_embd % config.n_head == 0

@@ -25,28 +24,34 @@ def __init__(self, config, state_dict, base_address, device):

self.device = device
# Get the weights
self.tt_weight_c_attn = state_dict[f"{base_address}.c_attn.weight"]
self.tt_weight_c_proj = state_dict[f"{base_address}.c_proj.weight"]

# Push weights to TT device
self.tt_weight_c_attn = torch_to_tt_tensor_rm(self.tt_weight_c_attn, self.device)
self.tt_weight_c_attn = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_attn.weight" + str(dtype) + ".bin"
)

self.tt_weight_c_proj = torch_to_tt_tensor_rm(self.tt_weight_c_proj, self.device)
self.tt_weight_c_proj = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_proj.weight" + str(dtype) + ".bin"
)

self.tt_weight_c_attn = tt_lib.tensor.transpose(self.tt_weight_c_attn, -2, -1)
self.tt_weight_c_proj = tt_lib.tensor.transpose(self.tt_weight_c_proj, -2, -1)

# Load biases
self.tt_bias_c_attn = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_attn.bias"], self.device)
self.tt_bias_c_attn = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_attn.bias" + str(dtype) + ".bin"
)

self.tt_bias_c_proj = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_proj.bias"], self.device)
self.tt_bias_c_proj = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_proj.bias" + str(dtype) + ".bin"
)

self.n_head = self.config.n_head
self.n_embd = self.config.n_embd

temp_bias = tt_lib.tensor.tril(tt_lib.tensor.ones([1, 1, self.block_size, self.block_size]))
temp_bias = tt_to_torch_tensor(temp_bias)
self.register_buffer(
"bias",
torch.tril(torch.ones(self.block_size, self.block_size)).view(1, 1, self.block_size, self.block_size),
temp_bias,
)

self.c_attn = Linear(
@@ -82,16 +87,16 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:
q, k, v = pt_x1.split(self.n_embd, dim=2)

k = torch_to_tt_tensor_rm(k, self.device)
k = fallback_ops.reshape(k, B, T, self.n_head, C // self.n_head)
k = tt_lib.tensor.reshape(k, B, T, self.n_head, C // self.n_head)
k = tt_lib.tensor.transpose(k, 1, 2)

q = torch_to_tt_tensor_rm(q, self.device)
q = fallback_ops.reshape(q, B, T, self.n_head, C // self.n_head)
q = tt_lib.tensor.reshape(q, B, T, self.n_head, C // self.n_head)
q = tt_lib.tensor.transpose(q, 1, 2)

v = torch_to_tt_tensor_rm(v, self.device)

v = fallback_ops.reshape(v, B, T, self.n_head, C // self.n_head)
v = tt_lib.tensor.reshape(v, B, T, self.n_head, C // self.n_head)
v = tt_lib.tensor.transpose(v, 1, 2)

# manual implementation of attention
Expand All @@ -107,12 +112,14 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:

tt_att = torch_to_tt_tensor_rm(att, self.device, put_on_device=False)

tt_att = fallback_ops.softmax(tt_att, dim=-1)
tt_att = tt_lib.tensor.softmax(
tt_att
) # Using tt_lib.tensor.softmax reduces pcc from 0.99 to 0.98 for whole model

tt_y = tt_lib.tensor.bmm(tt_att, v)

tt_y = tt_lib.tensor.transpose(tt_y, 1, -2)
tt_y = fallback_ops.reshape(tt_y, 1, B, T, C)
tt_y = tt_lib.tensor.reshape(tt_y, 1, B, T, C)

# output projection
x2 = self.c_proj(tt_y)
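For reference, a torch sketch of the attention math the TT ops in this forward implement (names and shapes are illustrative, not part of the commit):

import math
import torch

def reference_causal_attention(q, k, v, mask):
    # q, k, v: (B, n_head, T, head_dim); mask: (1, 1, S, S) lower-triangular ones,
    # mirroring the tril buffer registered in __init__ above.
    T = q.size(-2)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(mask[:, :, :T, :T] == 0, float("-inf"))
    att = torch.softmax(att, dim=-1)  # tt_lib.tensor.softmax in the TT path
    y = att @ v  # tt_lib.tensor.bmm in the TT path
    B, nh, _, hd = y.shape
    return y.transpose(1, 2).contiguous().view(B, T, nh * hd)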