From 7d782bfd317cfacfaad0faed76afb4b0b06ebe37 Mon Sep 17 00:00:00 2001 From: punithsekar Date: Mon, 27 Nov 2023 14:22:37 +0000 Subject: [PATCH] #3974: nanogpt uplift and move weights to weka path --- models/experimental/nanogpt/nanogpt_utils.py | 60 +++++++++++ .../nanogpt/tests/test_nanogpt_attention.py | 15 +-- .../nanogpt/tests/test_nanogpt_block.py | 12 ++- .../nanogpt/tests/test_nanogpt_mlp.py | 13 ++- .../nanogpt/tests/test_nanogpt_model_real.py | 17 +-- models/experimental/nanogpt/tt/nanogpt.py | 24 +++++ .../nanogpt/tt/nanogpt_attention.py | 39 ++++--- .../experimental/nanogpt/tt/nanogpt_block.py | 37 ++----- models/experimental/nanogpt/tt/nanogpt_mlp.py | 26 +++-- .../experimental/nanogpt/tt/nanogpt_model.py | 102 ++++++++++++++---- tests/scripts/run_performance.sh | 2 + 11 files changed, 247 insertions(+), 100 deletions(-) create mode 100644 models/experimental/nanogpt/nanogpt_utils.py create mode 100644 models/experimental/nanogpt/tt/nanogpt.py diff --git a/models/experimental/nanogpt/nanogpt_utils.py b/models/experimental/nanogpt/nanogpt_utils.py new file mode 100644 index 00000000000..500f6057d96 --- /dev/null +++ b/models/experimental/nanogpt/nanogpt_utils.py @@ -0,0 +1,60 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from models.utility_functions import tt2torch_tensor +import torch +import tt_lib +from transformers import GPT2LMHeadModel +from tt_lib.utils import pad_weight + + +def unpad_from_zero(x, desired_shape): + if x.shape()[-1] == desired_shape[-1] and x.shape()[-2] == desired_shape[-2]: + x = tt2torch_tensor(x) + else: + x = x.cpu() + if x.layout() != tt_lib.tensor.Layout.ROW_MAJOR: + x = x.to(tt_lib.tensor.Layout.ROW_MAJOR) + x = x.unpad( + (0, 0, 0, 0), (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1) + ) + x = x.to_torch().to(torch.float) + return x + + +def cache_weights_in_weka(device, dtype, reset_seeds): + model_hf = GPT2LMHeadModel.from_pretrained("gpt2") + state_dict = model_hf.state_dict() + weights_dtype = dtype + + # initial weights are stored in "models/experimental/nanogpt/weights/" and moved to weka path + file_name = "models/experimental/nanogpt/weights/" + for key, value in state_dict.items(): + if key.startswith("transformer.wte.") or key.startswith("transformer.wpe."): + torch.save(value, file_name + str(key) + ".pt") + continue + elif len(value.shape) == 0: + continue + if len(value.shape) == 1: + value = value.unsqueeze(0).unsqueeze(0).unsqueeze(0) + elif len(value.shape) == 3: + value = value.unsqueeze(0) + elif len(value.shape) == 2: + value = value.unsqueeze(0).unsqueeze(0) + if value.shape[-2] % 32 == 0 and value.shape[-1] % 32 == 0: + value = tt_lib.tensor.Tensor( + value.reshape(-1).tolist(), + value.shape, + weights_dtype, + tt_lib.tensor.Layout.ROW_MAJOR, + ).to(tt_lib.tensor.Layout.TILE) + else: + value = pad_weight(value) + value = tt_lib.tensor.Tensor( + value.reshape(-1).tolist(), + value.shape, + weights_dtype, + tt_lib.tensor.Layout.ROW_MAJOR, + ).to(tt_lib.tensor.Layout.TILE) + tt_lib.tensor.dump_tensor(file_name + str(key) + str(weights_dtype) + ".bin", value) diff --git a/models/experimental/nanogpt/tests/test_nanogpt_attention.py b/models/experimental/nanogpt/tests/test_nanogpt_attention.py index 67c3569d22c..0ef88343c47 100644 --- a/models/experimental/nanogpt/tests/test_nanogpt_attention.py +++ b/models/experimental/nanogpt/tests/test_nanogpt_attention.py @@ -4,10 +4,10 @@ import torch import pytest +import tt_lib from transformers 
import GPT2LMHeadModel - from loguru import logger import models.experimental.nanogpt.tt.nanogpt_attention as nanogpt_attention @@ -19,16 +19,17 @@ ) +@pytest.mark.parametrize( + "dtype", + (tt_lib.tensor.DataType.BFLOAT16,), +) @pytest.mark.parametrize( "pcc", ((0.99,),), ) - -def test_nanogpt_attn(device, pcc, reset_seeds): - +def test_nanogpt_attn(device, pcc, dtype, reset_seeds): # Prepare input model_hf = GPT2LMHeadModel.from_pretrained("gpt2") - sd = model_hf.state_dict() config = model_hf.config model_hf.eval() block = 0 @@ -38,8 +39,10 @@ def test_nanogpt_attn(device, pcc, reset_seeds): pt_attn = model_hf.transformer.h[block].attn pt_out = pt_attn.forward(test_in) + tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/" + tt_test_in = torch_to_tt_tensor_rm(test_in, device) - tt_attn = nanogpt_attention.TtCausalSelfAttention(config, sd, base_address, device) + tt_attn = nanogpt_attention.TtCausalSelfAttention(config, base_address, device, tt_cache_path, dtype) tt_out = tt_attn.forward(tt_test_in) diff --git a/models/experimental/nanogpt/tests/test_nanogpt_block.py b/models/experimental/nanogpt/tests/test_nanogpt_block.py index e22b0ab0187..43af815f18d 100644 --- a/models/experimental/nanogpt/tests/test_nanogpt_block.py +++ b/models/experimental/nanogpt/tests/test_nanogpt_block.py @@ -4,6 +4,7 @@ import torch import pytest +import tt_lib from transformers import GPT2LMHeadModel @@ -18,14 +19,16 @@ ) +@pytest.mark.parametrize( + "dtype", + (tt_lib.tensor.DataType.BFLOAT16,), +) @pytest.mark.parametrize( "pcc", ((0.99,),), ) -def test_nanogpt_block(device, pcc, reset_seeds): - +def test_nanogpt_block(device, pcc, dtype, reset_seeds): model_hf = GPT2LMHeadModel.from_pretrained("gpt2") - sd = model_hf.state_dict() config = model_hf.config model_hf.eval() block = 0 @@ -36,8 +39,9 @@ def test_nanogpt_block(device, pcc, reset_seeds): pt_out = pt_block.forward(test_in) tt_test_in = torch_to_tt_tensor_rm(test_in, device) + tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/" - tt_block = nanogpt_block.TtBlock(config, sd, base_address, device) + tt_block = nanogpt_block.TtBlock(config, base_address, device, tt_cache_path, dtype) tt_block.eval() tt_out = tt_block.forward(tt_test_in) diff --git a/models/experimental/nanogpt/tests/test_nanogpt_mlp.py b/models/experimental/nanogpt/tests/test_nanogpt_mlp.py index 466f102e575..e50629ed64d 100644 --- a/models/experimental/nanogpt/tests/test_nanogpt_mlp.py +++ b/models/experimental/nanogpt/tests/test_nanogpt_mlp.py @@ -4,13 +4,13 @@ import torch import pytest +import tt_lib from transformers import GPT2LMHeadModel from loguru import logger import models.experimental.nanogpt.tt.nanogpt_mlp as nanogpt_mlp - from models.utility_functions import ( tt_to_torch_tensor, torch_to_tt_tensor_rm, @@ -19,14 +19,16 @@ ) +@pytest.mark.parametrize( + "dtype", + (tt_lib.tensor.DataType.BFLOAT16,), +) @pytest.mark.parametrize( "pcc", ((0.99,),), ) -def test_nanogpt_mlp(device, pcc, reset_seeds): - +def test_nanogpt_mlp(device, pcc, dtype, reset_seeds): model_hf = GPT2LMHeadModel.from_pretrained("gpt2") - sd = model_hf.state_dict() config = model_hf.config model_hf.eval() block = 0 @@ -34,7 +36,8 @@ def test_nanogpt_mlp(device, pcc, reset_seeds): test_in = torch.rand(1, 43, 768) tt_test_in = torch_to_tt_tensor_rm(test_in, device) - tt_mlp = nanogpt_mlp.TtMLP(base_address, config, sd, device) + tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/" + tt_mlp = nanogpt_mlp.TtMLP(base_address, config, device, tt_cache_path, dtype) tt_out = tt_mlp.forward(tt_test_in) 
diff --git a/models/experimental/nanogpt/tests/test_nanogpt_model_real.py b/models/experimental/nanogpt/tests/test_nanogpt_model_real.py index 0217cfa58da..64ed66b232c 100644 --- a/models/experimental/nanogpt/tests/test_nanogpt_model_real.py +++ b/models/experimental/nanogpt/tests/test_nanogpt_model_real.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 -import torch +import tt_lib import pytest from transformers import GPT2Tokenizer, GPT2LMHeadModel @@ -13,17 +13,18 @@ from models.utility_functions import tt_to_torch_tensor, comp_allclose, comp_pcc - +@pytest.mark.parametrize( + "dtype", + (tt_lib.tensor.DataType.BFLOAT16,), +) @pytest.mark.parametrize( "pcc, prompt", - ((0.99, "Hello, my dog is a little"),), + ((0.98, "Hello, my dog is a little"),), ) -def test_nanogpt_model_real(device, pcc, prompt, reset_seeds): - +def test_nanogpt_model_real(device, pcc, prompt, dtype, reset_seeds): # Prepare input model_hf = GPT2LMHeadModel.from_pretrained("gpt2") tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - sd = model_hf.state_dict() model_hf.eval() inputs = tokenizer(prompt, return_tensors="pt", padding=False) @@ -33,7 +34,9 @@ def test_nanogpt_model_real(device, pcc, prompt, reset_seeds): config = model_hf.config - tt_model = nanogpt_model.TtGPT(config, sd, device) + tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/" + + tt_model = nanogpt_model.TtGPT(config, device, tt_cache_path, dtype) tt_out = tt_model.forward(inputs.input_ids) diff --git a/models/experimental/nanogpt/tt/nanogpt.py b/models/experimental/nanogpt/tt/nanogpt.py new file mode 100644 index 00000000000..7b2a0ca95af --- /dev/null +++ b/models/experimental/nanogpt/tt/nanogpt.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc. + +# SPDX-License-Identifier: Apache-2.0 + +from transformers import GPT2LMHeadModel +from models.experimental.nanogpt.tt.nanogpt_model import TtGPT + + +def _nanogpt(config, device, tt_cache_path, dtype): + return TtGPT( + config=config, + device=device, + tt_cache_path=tt_cache_path, + dtype=dtype, + ) + + +def nanogpt_model(device, dtype) -> TtGPT: + model_name = "gpt2" + model = GPT2LMHeadModel.from_pretrained(model_name) + config = model.config + tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/" + model = _nanogpt(config, device, tt_cache_path, dtype) + return model diff --git a/models/experimental/nanogpt/tt/nanogpt_attention.py b/models/experimental/nanogpt/tt/nanogpt_attention.py index eb37adf6624..57f148ba097 100644 --- a/models/experimental/nanogpt/tt/nanogpt_attention.py +++ b/models/experimental/nanogpt/tt/nanogpt_attention.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 -import torch import torch.nn as nn import tt_lib import math @@ -16,7 +15,7 @@ class TtCausalSelfAttention(nn.Module): - def __init__(self, config, state_dict, base_address, device): + def __init__(self, config, base_address, device, tt_cache_path, dtype): super().__init__() assert config.n_embd % config.n_head == 0 @@ -25,28 +24,34 @@ def __init__(self, config, state_dict, base_address, device): self.device = device # Get the weights - self.tt_weight_c_attn = state_dict[f"{base_address}.c_attn.weight"] - self.tt_weight_c_proj = state_dict[f"{base_address}.c_proj.weight"] - - # Push weights to Ttp device - self.tt_weight_c_attn = torch_to_tt_tensor_rm(self.tt_weight_c_attn, self.device) + self.tt_weight_c_attn = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_attn.weight" + str(dtype) + ".bin" + ) - self.tt_weight_c_proj = 
torch_to_tt_tensor_rm(self.tt_weight_c_proj, self.device) + self.tt_weight_c_proj = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_proj.weight" + str(dtype) + ".bin" + ) self.tt_weight_c_attn = tt_lib.tensor.transpose(self.tt_weight_c_attn, -2, -1) self.tt_weight_c_proj = tt_lib.tensor.transpose(self.tt_weight_c_proj, -2, -1) # Load biases - self.tt_bias_c_attn = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_attn.bias"], self.device) + self.tt_bias_c_attn = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_attn.bias" + str(dtype) + ".bin" + ) - self.tt_bias_c_proj = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_proj.bias"], self.device) + self.tt_bias_c_proj = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_proj.bias" + str(dtype) + ".bin" + ) self.n_head = self.config.n_head self.n_embd = self.config.n_embd + temp_bias = tt_lib.tensor.tril(tt_lib.tensor.ones([1, 1, self.block_size, self.block_size])) + temp_bias = tt_to_torch_tensor(temp_bias) self.register_buffer( "bias", - torch.tril(torch.ones(self.block_size, self.block_size)).view(1, 1, self.block_size, self.block_size), + temp_bias, ) self.c_attn = Linear( @@ -82,16 +87,16 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor: q, k, v = pt_x1.split(self.n_embd, dim=2) k = torch_to_tt_tensor_rm(k, self.device) - k = fallback_ops.reshape(k, B, T, self.n_head, C // self.n_head) + k = tt_lib.tensor.reshape(k, B, T, self.n_head, C // self.n_head) k = tt_lib.tensor.transpose(k, 1, 2) q = torch_to_tt_tensor_rm(q, self.device) - q = fallback_ops.reshape(q, B, T, self.n_head, C // self.n_head) + q = tt_lib.tensor.reshape(q, B, T, self.n_head, C // self.n_head) q = tt_lib.tensor.transpose(q, 1, 2) v = torch_to_tt_tensor_rm(v, self.device) - v = fallback_ops.reshape(v, B, T, self.n_head, C // self.n_head) + v = tt_lib.tensor.reshape(v, B, T, self.n_head, C // self.n_head) v = tt_lib.tensor.transpose(v, 1, 2) # manual implementation of attention @@ -107,12 +112,14 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor: tt_att = torch_to_tt_tensor_rm(att, self.device, put_on_device=False) - tt_att = fallback_ops.softmax(tt_att, dim=-1) + tt_att = tt_lib.tensor.softmax( + tt_att + ) # Using tt_lib.tensor.softmax reduces pcc from 0.99 to 0.98 for whole model tt_y = tt_lib.tensor.bmm(tt_att, v) tt_y = tt_lib.tensor.transpose(tt_y, 1, -2) - tt_y = fallback_ops.reshape(tt_y, 1, B, T, C) + tt_y = tt_lib.tensor.reshape(tt_y, 1, B, T, C) # output projection x2 = self.c_proj(tt_y) diff --git a/models/experimental/nanogpt/tt/nanogpt_block.py b/models/experimental/nanogpt/tt/nanogpt_block.py index 349b9f48c75..1606ab2b42d 100644 --- a/models/experimental/nanogpt/tt/nanogpt_block.py +++ b/models/experimental/nanogpt/tt/nanogpt_block.py @@ -8,55 +8,36 @@ import models.experimental.nanogpt.tt.nanogpt_attention as nanogpt_attention -from models.utility_functions import ( - torch_to_tt_tensor_rm, -) - - class TtBlock(nn.Module): - def __init__(self, config, state_dict, base_address, device): + def __init__(self, config, base_address, device, tt_cache_path, dtype): super().__init__() self.device = device self.config = config - self.beta_1 = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_1.bias"], self.device - ) + self.beta_1 = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_1.bias" + str(dtype) + ".bin") - self.gamma_1 = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_1.weight"], self.device - ) + self.gamma_1 = 
tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_1.weight" + str(dtype) + ".bin") self.ln_1 = tt_lib.tensor.layernorm self.attn = nanogpt_attention.TtCausalSelfAttention( - config, state_dict, f"{base_address}.attn", device + config, f"{base_address}.attn", device, tt_cache_path, dtype ) - self.beta_2 = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_2.bias"], self.device - ) + self.beta_2 = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_2.bias" + str(dtype) + ".bin") - self.gamma_2 = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_2.weight"], self.device - ) + self.gamma_2 = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_2.weight" + str(dtype) + ".bin") self.ln_2 = tt_lib.tensor.layernorm - self.mlp = nanogpt_mlp.TtMLP( - f"{base_address}.mlp", self.config, state_dict, device - ) + self.mlp = nanogpt_mlp.TtMLP(f"{base_address}.mlp", self.config, device, tt_cache_path, dtype) def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor: - tmp = self.attn.forward( - self.ln_1(x, eps=1e-5, gamma=self.gamma_1, beta=self.beta_1) - ) + tmp = self.attn.forward(self.ln_1(x, eps=1e-5, gamma=self.gamma_1, beta=self.beta_1)) x = tt_lib.tensor.add(x, tmp) - tmp = self.mlp.forward( - self.ln_2(x, eps=1e-5, gamma=self.gamma_2, beta=self.beta_2) - ) + tmp = self.mlp.forward(self.ln_2(x, eps=1e-5, gamma=self.gamma_2, beta=self.beta_2)) x = tt_lib.tensor.add(x, tmp) return x diff --git a/models/experimental/nanogpt/tt/nanogpt_mlp.py b/models/experimental/nanogpt/tt/nanogpt_mlp.py index f97c5b49b9f..fa901c928b5 100644 --- a/models/experimental/nanogpt/tt/nanogpt_mlp.py +++ b/models/experimental/nanogpt/tt/nanogpt_mlp.py @@ -6,29 +6,27 @@ import tt_lib from models.helper_funcs import Linear -from models.utility_functions import ( - torch_to_tt_tensor_rm, -) - class TtMLP(torch.nn.Module): - def __init__(self, base_address, config, state_dict, device): + def __init__(self, base_address, config, device, tt_cache_path, dtype): super().__init__() # Get the weights - self.tt_weight_c_fc = state_dict[f"{base_address}.c_fc.weight"] - self.tt_weight_c_proj = state_dict[f"{base_address}.c_proj.weight"] + self.tt_weight_c_fc = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_fc.weight" + str(dtype) + ".bin" + ) + self.tt_weight_c_proj = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_proj.weight" + str(dtype) + ".bin" + ) + self.config = config self.device = device - # Push weights to Tt device - self.tt_weight_c_fc = torch_to_tt_tensor_rm(self.tt_weight_c_fc, self.device) - - self.tt_weight_c_proj = torch_to_tt_tensor_rm(self.tt_weight_c_proj, self.device) - # Load biases - self.tt_bias_c_fc = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_fc.bias"], self.device) + self.tt_bias_c_fc = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".c_fc.bias" + str(dtype) + ".bin") - self.tt_bias_c_proj = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_proj.bias"], self.device) + self.tt_bias_c_proj = tt_lib.tensor.load_tensor( + tt_cache_path + base_address + ".c_proj.bias" + str(dtype) + ".bin" + ) self.tt_weight_c_fc = tt_lib.tensor.transpose(self.tt_weight_c_fc, -2, -1) self.tt_weight_c_proj = tt_lib.tensor.transpose(self.tt_weight_c_proj, -2, -1) diff --git a/models/experimental/nanogpt/tt/nanogpt_model.py b/models/experimental/nanogpt/tt/nanogpt_model.py index 03d16aa654e..eabe070c7bd 100644 --- a/models/experimental/nanogpt/tt/nanogpt_model.py +++ b/models/experimental/nanogpt/tt/nanogpt_model.py @@ -6,16 +6,19 
@@ import torch.nn as nn import tt_lib from models.helper_funcs import Linear +import tt_lib.fallback_ops as fallback_ops import models.experimental.nanogpt.tt.nanogpt_block as nanogpt_block +from models.experimental.nanogpt.nanogpt_utils import unpad_from_zero from models.utility_functions import ( torch_to_tt_tensor_rm, + tt_to_torch_tensor, ) class TtGPT(nn.Module): - def __init__(self, config, state_dict, device): + def __init__(self, config, device, tt_cache_path, dtype): super().__init__() assert config.vocab_size is not None @@ -24,48 +27,47 @@ def __init__(self, config, state_dict, device): self.config.block_size = 1024 base_address = f"transformer" self.device = device - self.beta = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_f.bias"], self.device - ) - self.gamma = torch_to_tt_tensor_rm( - state_dict[f"{base_address}.ln_f.weight"], self.device - ) + self.beta = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_f.bias" + str(dtype) + ".bin") + + self.gamma = tt_lib.tensor.load_tensor(tt_cache_path + base_address + ".ln_f.weight" + str(dtype) + ".bin") self.wte = nn.Embedding(config.vocab_size, config.n_embd) self.wpe = nn.Embedding(self.config.block_size, config.n_embd) - self.wte.weight = torch.nn.Parameter(state_dict[f"{base_address}.wte.weight"]) + self.wte.weight = torch.nn.Parameter(torch.load(tt_cache_path + "transformer.wte.weight.pt")) - self.wpe.weight = torch.nn.Parameter(state_dict[f"{base_address}.wpe.weight"]) + self.wpe.weight = torch.nn.Parameter(torch.load(tt_cache_path + "transformer.wpe.weight.pt")) blocks = [] for i in range(config.n_layer): - block = nanogpt_block.TtBlock( - self.config, state_dict, f"{base_address}.h.{i}", self.device - ) + block = nanogpt_block.TtBlock(self.config, f"{base_address}.h.{i}", self.device, tt_cache_path, dtype) blocks.append(block) self.h = nn.ModuleList(blocks) self.ln_f = tt_lib.tensor.layernorm - # Push weights to Tt device - tt_lm_weight = torch_to_tt_tensor_rm(state_dict["lm_head.weight"], self.device) + tt_lm_weight = tt_lib.tensor.load_tensor(tt_cache_path + "lm_head.weight" + str(dtype) + ".bin") + + weight = unpad_from_zero(tt_lm_weight, (1, 1, self.config.vocab_size, self.config.n_embd)) + weight_torch = weight + weight = torch_to_tt_tensor_rm(weight, device=self.device) - self.lm_head = Linear(self.config.n_embd, self.config.vocab_size, tt_lm_weight) + self.lm_head = Linear(self.config.n_embd, self.config.vocab_size, weight) - self.wte.weight = nn.Parameter( - state_dict["lm_head.weight"] - ) # https://paperswithcode.com/method/weight-tying + self.wte.weight = nn.Parameter(weight_torch.squeeze()) # https://paperswithcode.com/method/weight-tying def forward(self, idx: torch.Tensor) -> tt_lib.tensor.Tensor: b, t = idx.shape assert ( t <= self.config.block_size ), f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" - pos = torch.arange(0, t, dtype=torch.long).unsqueeze(0) # shape (1, t) + pos = tt_lib.tensor.arange(0, t, 1) + pos = tt_to_torch_tensor(pos) + pos = pos.squeeze(0).squeeze(0) + pos = pos.to(dtype=torch.int64) # forward the GPT model itself tok_emb = self.wte(idx) # token embeddings of shape (b, t, n_embd) @@ -74,7 +76,13 @@ def forward(self, idx: torch.Tensor) -> tt_lib.tensor.Tensor: tt_tok_emb = torch_to_tt_tensor_rm(tok_emb, self.device) tt_pos_emb = torch_to_tt_tensor_rm(pos_emb, self.device) - tt_x = tt_lib.tensor.add(tt_tok_emb, tt_pos_emb) + tt_tok_emb = tt_lib.tensor.permute(tt_tok_emb, (0, 2, 1, 3)) + tt_pos_emb = 
tt_lib.tensor.permute(tt_pos_emb, (0, 2, 1, 3)) + + tt_x = tt_lib.tensor.bcast(tt_tok_emb, tt_pos_emb, tt_lib.tensor.BcastOpMath.ADD, tt_lib.tensor.BcastOpDim.H) + tt_tok_emb.deallocate() + tt_pos_emb.deallocate() + tt_x = tt_lib.tensor.permute(tt_x, (0, 2, 1, 3)) for block in self.h: tt_x = block.forward(tt_x) @@ -83,3 +91,57 @@ def forward(self, idx: torch.Tensor) -> tt_lib.tensor.Tensor: logits = self.lm_head(tt_x) return logits + + def generate( + self, + idx: torch.Tensor, + max_new_tokens: int = 20, + temperature: int = 1.0, + top_k=None, + ) -> torch.Tensor: + """ + Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete + the sequence max_new_tokens times, feeding the predictions back into the model each time. + Most likely you'll want to make sure to be in model.eval() mode of operation for this. + """ + for _ in range(max_new_tokens): + # if the sequence context is growing too long we must crop it at block_size + idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size :] + # forward the model to get the logits for the index in the sequence + tt_logits = self.forward(idx_cond) + + logits_shapes = tt_logits.shape() + + slice_list = [ + slice(None), + slice(None), + slice(logits_shapes[2] - 1, logits_shapes[2]), + slice(None), + ] + tt_logits = fallback_ops.tensor_slice(tt_logits, slice_list) + + tt_temperature = fallback_ops.full(tt_logits.shape(), temperature) + + tt_temperature = tt_lib.tensor.recip(tt_temperature) + tt_logits = tt_lib.tensor.mul(tt_logits, tt_temperature) + + logits = tt_to_torch_tensor(tt_logits) + # optionally crop the logits to only the top k options + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + logits[logits < v[:, [-1]]] = -float("Inf") + + # apply softmax to convert logits to (normalized) probabilities + tt_logits = torch_to_tt_tensor_rm(logits, self.device, put_on_device=False) + tt_probs = fallback_ops.softmax(tt_logits, dim=-1) + probs = tt_to_torch_tensor(tt_probs) + probs = probs.squeeze(0) + probs = probs.squeeze(0) + + # sample from the distribution + idx_next = torch.multinomial(probs, num_samples=1) + + # append sampled index to the running sequence and continue + idx = torch.cat((idx, idx_next), dim=1) + + return idx diff --git a/tests/scripts/run_performance.sh b/tests/scripts/run_performance.sh index af5a1ced55e..516a6c9d132 100755 --- a/tests/scripts/run_performance.sh +++ b/tests/scripts/run_performance.sh @@ -34,6 +34,8 @@ run_perf_models() { env pytest models/experimental/bloom/tests -m $pipeline_type + env pytest models/experimental/nanogpt/tests -m $pipeline_type + ## Merge all the generated reports env python models/merge_perf_results.py }
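
Reviewer note (not part of the patch): a minimal end-to-end usage sketch of the new nanogpt_model factory and the TtGPT.generate() path added above. It assumes the cached weight files already exist under the weka path the tests read from (/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/) and that `device` is an open tt_lib device handle such as the one provided by the repository's pytest `device` fixture; the function name run_nanogpt_demo is hypothetical.

# Hypothetical driver, a sketch only -- not part of this patch.
# Assumes `device` is an already-initialized tt_lib device (e.g. the pytest
# fixture) and that the cached weights exist at the weka path used by the model code.
import tt_lib
from transformers import GPT2Tokenizer
from models.experimental.nanogpt.tt.nanogpt import nanogpt_model


def run_nanogpt_demo(device, prompt="Hello, my dog is a little", max_new_tokens=20):
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    # Builds TtGPT from the GPT-2 config and the cached weka weights (see nanogpt.py).
    tt_model = nanogpt_model(device, tt_lib.tensor.DataType.BFLOAT16)
    inputs = tokenizer(prompt, return_tensors="pt", padding=False)
    # generate() crops the context to block_size, feeds predictions back into the
    # model, and returns the running sequence of token ids as a torch tensor.
    idx = tt_model.generate(inputs.input_ids, max_new_tokens=max_new_tokens)
    return tokenizer.decode(idx[0].tolist())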
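
Reviewer note (not part of the patch): the tests now load pre-converted weights from the cache path instead of converting the Hugging Face state dict at run time, so the cache has to be populated once. A hedged sketch of that one-time export using the new cache_weights_in_weka helper: it writes .pt files for the wte/wpe embeddings and tiled .bin tensors for everything else into models/experimental/nanogpt/weights/, which per the helper's comment are then moved to the weka path by hand. The device and reset_seeds arguments appear unused by the helper body as written, so None is passed for both here; that is an observation from the diff, not a documented contract.

# One-time weight export, a sketch assuming write access to the local
# weights/ directory that cache_weights_in_weka targets.
import os
import tt_lib
from models.experimental.nanogpt.nanogpt_utils import cache_weights_in_weka

os.makedirs("models/experimental/nanogpt/weights/", exist_ok=True)
# device and reset_seeds are not used inside the helper as written, so None is fine.
cache_weights_in_weka(None, tt_lib.tensor.DataType.BFLOAT16, None)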