#3974: nanogpt uplift and move weights to weka path
punithsekar committed Dec 8, 2023
1 parent 5aeccad commit 7d782bf
Showing 11 changed files with 247 additions and 100 deletions.
60 changes: 60 additions & 0 deletions models/experimental/nanogpt/nanogpt_utils.py
@@ -0,0 +1,60 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from models.utility_functions import tt2torch_tensor
import torch
import tt_lib
from transformers import GPT2LMHeadModel
from tt_lib.utils import pad_weight


def unpad_from_zero(x, desired_shape):
if x.shape()[-1] == desired_shape[-1] and x.shape()[-2] == desired_shape[-2]:
x = tt2torch_tensor(x)
else:
x = x.cpu()
if x.layout() != tt_lib.tensor.Layout.ROW_MAJOR:
x = x.to(tt_lib.tensor.Layout.ROW_MAJOR)
x = x.unpad(
(0, 0, 0, 0), (desired_shape[0] - 1, desired_shape[1] - 1, desired_shape[2] - 1, desired_shape[3] - 1)
)
x = x.to_torch().to(torch.float)
return x


def cache_weights_in_weka(device, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
state_dict = model_hf.state_dict()
weights_dtype = dtype

# initial weights are stored in "models/experimental/nanogpt/weights/" and moved to weka path
file_name = "models/experimental/nanogpt/weights/"
for key, value in state_dict.items():
if key.startswith("transformer.wte.") or key.startswith("transformer.wpe."):
torch.save(value, file_name + str(key) + ".pt")
continue
elif len(value.shape) == 0:
continue
if len(value.shape) == 1:
value = value.unsqueeze(0).unsqueeze(0).unsqueeze(0)
elif len(value.shape) == 3:
value = value.unsqueeze(0)
elif len(value.shape) == 2:
value = value.unsqueeze(0).unsqueeze(0)
if value.shape[-2] % 32 == 0 and value.shape[-1] % 32 == 0:
value = tt_lib.tensor.Tensor(
value.reshape(-1).tolist(),
value.shape,
weights_dtype,
tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
else:
value = pad_weight(value)
value = tt_lib.tensor.Tensor(
value.reshape(-1).tolist(),
value.shape,
weights_dtype,
tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
tt_lib.tensor.dump_tensor(file_name + str(key) + str(weights_dtype) + ".bin", value)
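For context, a minimal sketch of the round trip this utility enables: pad a non-tile-aligned weight, tile it, dump it to disk, and load it back at construction time. The key and shape below are hypothetical; the tests use /mnt/MLPerf/tt_dnn-models/tt/NanoGPT/ as the real cache path.

import torch
import tt_lib
from tt_lib.utils import pad_weight

dtype = tt_lib.tensor.DataType.BFLOAT16
value = torch.rand(1, 1, 100, 300)  # hypothetical weight; last two dims not multiples of 32
value = pad_weight(value)  # zero-pads the last two dims up to tile boundaries
tt_value = tt_lib.tensor.Tensor(
    value.reshape(-1).tolist(),
    value.shape,
    dtype,
    tt_lib.tensor.Layout.ROW_MAJOR,
).to(tt_lib.tensor.Layout.TILE)
tt_lib.tensor.dump_tensor("example.weight" + str(dtype) + ".bin", tt_value)

# Construction-time load, as the uplifted modules below now do:
reloaded = tt_lib.tensor.load_tensor("example.weight" + str(dtype) + ".bin")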
15 changes: 9 additions & 6 deletions models/experimental/nanogpt/tests/test_nanogpt_attention.py
@@ -4,10 +4,10 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel


from loguru import logger
import models.experimental.nanogpt.tt.nanogpt_attention as nanogpt_attention

@@ -19,16 +19,17 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)

def test_nanogpt_attn(device, pcc, reset_seeds):

def test_nanogpt_attn(device, pcc, dtype, reset_seeds):
# Prepare input
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
@@ -38,8 +39,10 @@ def test_nanogpt_attn(device, pcc, reset_seeds):
pt_attn = model_hf.transformer.h[block].attn
pt_out = pt_attn.forward(test_in)

tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_attn = nanogpt_attention.TtCausalSelfAttention(config, sd, base_address, device)
tt_attn = nanogpt_attention.TtCausalSelfAttention(config, base_address, device, tt_cache_path, dtype)

tt_out = tt_attn.forward(tt_test_in)

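The pcc threshold these tests assert on is a Pearson correlation between the flattened golden and TT outputs. A minimal sketch of that check, assuming the repo's comp_pcc behaves equivalently (its exact return format may differ):

import torch

def pearson_cc(golden: torch.Tensor, actual: torch.Tensor) -> float:
    # Flatten both outputs and compute the Pearson correlation coefficient,
    # which is what the 0.99 threshold above is compared against.
    g = golden.flatten().to(torch.float)
    a = actual.flatten().to(torch.float)
    return torch.corrcoef(torch.stack([g, a]))[0, 1].item()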
12 changes: 8 additions & 4 deletions models/experimental/nanogpt/tests/test_nanogpt_block.py
@@ -4,6 +4,7 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel

@@ -18,14 +19,16 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)
def test_nanogpt_block(device, pcc, reset_seeds):

def test_nanogpt_block(device, pcc, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
@@ -36,8 +39,9 @@ def test_nanogpt_block(device, pcc, reset_seeds):
pt_out = pt_block.forward(test_in)

tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_block = nanogpt_block.TtBlock(config, sd, base_address, device)
tt_block = nanogpt_block.TtBlock(config, base_address, device, tt_cache_path, dtype)
tt_block.eval()

tt_out = tt_block.forward(tt_test_in)
13 changes: 8 additions & 5 deletions models/experimental/nanogpt/tests/test_nanogpt_mlp.py
@@ -4,13 +4,13 @@

import torch
import pytest
import tt_lib

from transformers import GPT2LMHeadModel

from loguru import logger
import models.experimental.nanogpt.tt.nanogpt_mlp as nanogpt_mlp


from models.utility_functions import (
tt_to_torch_tensor,
torch_to_tt_tensor_rm,
@@ -19,22 +19,25 @@
)


@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc",
((0.99,),),
)
def test_nanogpt_mlp(device, pcc, reset_seeds):

def test_nanogpt_mlp(device, pcc, dtype, reset_seeds):
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
sd = model_hf.state_dict()
config = model_hf.config
model_hf.eval()
block = 0
base_address = f"transformer.h.{block}.mlp"

test_in = torch.rand(1, 43, 768)
tt_test_in = torch_to_tt_tensor_rm(test_in, device)
tt_mlp = nanogpt_mlp.TtMLP(base_address, config, sd, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"
tt_mlp = nanogpt_mlp.TtMLP(base_address, config, device, tt_cache_path, dtype)

tt_out = tt_mlp.forward(tt_test_in)

17 changes: 10 additions & 7 deletions models/experimental/nanogpt/tests/test_nanogpt_model_real.py
@@ -2,7 +2,7 @@

# SPDX-License-Identifier: Apache-2.0

import torch
import tt_lib
import pytest

from transformers import GPT2Tokenizer, GPT2LMHeadModel
@@ -13,17 +13,18 @@
from models.utility_functions import tt_to_torch_tensor, comp_allclose, comp_pcc



@pytest.mark.parametrize(
"dtype",
(tt_lib.tensor.DataType.BFLOAT16,),
)
@pytest.mark.parametrize(
"pcc, prompt",
((0.99, "Hello, my dog is a little"),),
((0.98, "Hello, my dog is a little"),),
)
def test_nanogpt_model_real(device, pcc, prompt, reset_seeds):

def test_nanogpt_model_real(device, pcc, prompt, dtype, reset_seeds):
# Prepare input
model_hf = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
sd = model_hf.state_dict()
model_hf.eval()

inputs = tokenizer(prompt, return_tensors="pt", padding=False)
@@ -33,7 +34,9 @@ def test_nanogpt_model_real(device, pcc, prompt, reset_seeds):

config = model_hf.config

tt_model = nanogpt_model.TtGPT(config, sd, device)
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"

tt_model = nanogpt_model.TtGPT(config, device, tt_cache_path, dtype)

tt_out = tt_model.forward(inputs.input_ids)

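For orientation, a sketch of the golden side of this comparison; the TT logits are then held to the 0.98 PCC floor (relaxed from 0.99 by the on-device softmax, per the comment in nanogpt_attention.py below):

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model_hf = GPT2LMHeadModel.from_pretrained("gpt2").eval()
inputs = tokenizer("Hello, my dog is a little", return_tensors="pt", padding=False)
with torch.no_grad():
    pt_logits = model_hf(inputs.input_ids).logits  # golden reference for comp_pcc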
24 changes: 24 additions & 0 deletions models/experimental/nanogpt/tt/nanogpt.py
@@ -0,0 +1,24 @@
# SPDX-FileCopyrightText: © 2023 Tenstorrent Inc.

# SPDX-License-Identifier: Apache-2.0

from transformers import GPT2LMHeadModel
from models.experimental.nanogpt.tt.nanogpt_model import TtGPT


def _nanogpt(config, device, tt_cache_path, dtype):
return TtGPT(
config=config,
device=device,
tt_cache_path=tt_cache_path,
dtype=dtype,
)


def nanogpt_model(device, dtype) -> TtGPT:
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
config = model.config
tt_cache_path = "/mnt/MLPerf/tt_dnn-models/tt/NanoGPT/"
model = _nanogpt(config, device, tt_cache_path, dtype)
return model
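A usage sketch for this factory; the device handle is assumed to come from the same device fixture the tests use:

import tt_lib
from models.experimental.nanogpt.tt.nanogpt import nanogpt_model

def build_nanogpt(device):
    # dtype selects which cached .bin weight variants are loaded from the weka path.
    dtype = tt_lib.tensor.DataType.BFLOAT16
    return nanogpt_model(device, dtype)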
39 changes: 23 additions & 16 deletions models/experimental/nanogpt/tt/nanogpt_attention.py
@@ -2,7 +2,6 @@

# SPDX-License-Identifier: Apache-2.0

import torch
import torch.nn as nn
import tt_lib
import math
@@ -16,7 +15,7 @@


class TtCausalSelfAttention(nn.Module):
def __init__(self, config, state_dict, base_address, device):
def __init__(self, config, base_address, device, tt_cache_path, dtype):
super().__init__()
assert config.n_embd % config.n_head == 0

@@ -25,28 +24,34 @@ def __init__(self, config, state_dict, base_address, device):

self.device = device
# Get the weights
self.tt_weight_c_attn = state_dict[f"{base_address}.c_attn.weight"]
self.tt_weight_c_proj = state_dict[f"{base_address}.c_proj.weight"]

# Push weights to TT device
self.tt_weight_c_attn = torch_to_tt_tensor_rm(self.tt_weight_c_attn, self.device)
self.tt_weight_c_attn = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_attn.weight" + str(dtype) + ".bin"
)

self.tt_weight_c_proj = torch_to_tt_tensor_rm(self.tt_weight_c_proj, self.device)
self.tt_weight_c_proj = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_proj.weight" + str(dtype) + ".bin"
)

self.tt_weight_c_attn = tt_lib.tensor.transpose(self.tt_weight_c_attn, -2, -1)
self.tt_weight_c_proj = tt_lib.tensor.transpose(self.tt_weight_c_proj, -2, -1)

# Load biases
self.tt_bias_c_attn = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_attn.bias"], self.device)
self.tt_bias_c_attn = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_attn.bias" + str(dtype) + ".bin"
)

self.tt_bias_c_proj = torch_to_tt_tensor_rm(state_dict[f"{base_address}.c_proj.bias"], self.device)
self.tt_bias_c_proj = tt_lib.tensor.load_tensor(
tt_cache_path + base_address + ".c_proj.bias" + str(dtype) + ".bin"
)

self.n_head = self.config.n_head
self.n_embd = self.config.n_embd

temp_bias = tt_lib.tensor.tril(tt_lib.tensor.ones([1, 1, self.block_size, self.block_size]))
temp_bias = tt_to_torch_tensor(temp_bias)
self.register_buffer(
"bias",
torch.tril(torch.ones(self.block_size, self.block_size)).view(1, 1, self.block_size, self.block_size),
temp_bias,
)

self.c_attn = Linear(
@@ -82,16 +87,16 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:
q, k, v = pt_x1.split(self.n_embd, dim=2)

k = torch_to_tt_tensor_rm(k, self.device)
k = fallback_ops.reshape(k, B, T, self.n_head, C // self.n_head)
k = tt_lib.tensor.reshape(k, B, T, self.n_head, C // self.n_head)
k = tt_lib.tensor.transpose(k, 1, 2)

q = torch_to_tt_tensor_rm(q, self.device)
q = fallback_ops.reshape(q, B, T, self.n_head, C // self.n_head)
q = tt_lib.tensor.reshape(q, B, T, self.n_head, C // self.n_head)
q = tt_lib.tensor.transpose(q, 1, 2)

v = torch_to_tt_tensor_rm(v, self.device)

v = fallback_ops.reshape(v, B, T, self.n_head, C // self.n_head)
v = tt_lib.tensor.reshape(v, B, T, self.n_head, C // self.n_head)
v = tt_lib.tensor.transpose(v, 1, 2)

# manual implementation of attention
Expand All @@ -107,12 +112,14 @@ def forward(self, x: tt_lib.tensor.Tensor) -> tt_lib.tensor.Tensor:

tt_att = torch_to_tt_tensor_rm(att, self.device, put_on_device=False)

tt_att = fallback_ops.softmax(tt_att, dim=-1)
tt_att = tt_lib.tensor.softmax(
tt_att
) # Using tt_lib.tensor.softmax reduces pcc from 0.99 to 0.98 for whole model

tt_y = tt_lib.tensor.bmm(tt_att, v)

tt_y = tt_lib.tensor.transpose(tt_y, 1, -2)
tt_y = fallback_ops.reshape(tt_y, 1, B, T, C)
tt_y = tt_lib.tensor.reshape(tt_y, 1, B, T, C)

# output projection
x2 = self.c_proj(tt_y)
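For reference, a torch sketch of the attention math the TT ops in this forward implement (names and shapes are illustrative, not part of the commit):

import math
import torch

def reference_causal_attention(q, k, v, mask):
    # q, k, v: (B, n_head, T, head_dim); mask: (1, 1, S, S) lower-triangular ones,
    # mirroring the tril buffer registered in __init__ above.
    T = q.size(-2)
    att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
    att = att.masked_fill(mask[:, :, :T, :T] == 0, float("-inf"))
    att = torch.softmax(att, dim=-1)  # tt_lib.tensor.softmax in the TT path
    y = att @ v  # tt_lib.tensor.bmm in the TT path
    B, nh, _, hd = y.shape
    return y.transpose(1, 2).contiguous().view(B, T, nh * hd)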