Commit

update gpt-2 code
akki2825 committed Nov 20, 2024
1 parent 513840e commit ca8bb4e
Showing 1 changed file with 114 additions and 51 deletions.
165 changes: 114 additions & 51 deletions 2024/weeks/week07/gpt2.ipynb
@@ -118,7 +118,7 @@
"source": [
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 1024, # Context length\n",
" \"context_length\": 256, # Context length\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
@@ -479,47 +479,7 @@
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x\n",
"\n",
"class GPTModel(nn.Module):\n",
" def __init__(self, cfg):\n",
" super().__init__()\n",
" self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
" self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
" self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
"\n",
" self.trf_blocks = nn.Sequential(\n",
" *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n",
"\n",
" self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
" self.out_head = nn.Linear(\n",
" cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
" )\n",
"\n",
" def forward(self, in_idx):\n",
" batch_size, seq_len = in_idx.shape\n",
" tok_embeds = self.tok_emb(in_idx)\n",
" pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
" x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
" x = self.drop_emb(x)\n",
" x = self.trf_blocks(x)\n",
" x = self.final_norm(x)\n",
" logits = self.out_head(x)\n",
" return logits\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"model.eval(); # Disable dropout during inference"
" return x\n"
]
},
{
@@ -629,6 +589,10 @@
{
"cell_type": "code",
"source": [
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"\n",
"\n",
"total_params = sum(p.numel() for p in model.parameters())\n",
"print(f\"Total number of parameters: {total_params:,}\")"
],
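Editorial note: the count printed by this cell comes out around 163 million rather than 124 million, because this GPTModel keeps a separate output projection while the original GPT-2 ties it to the token embedding. A minimal sketch of the tied figure, assuming the model instantiated above:

# out_head and tok_emb share the shape (vocab_size x emb_dim), so removing
# the duplicate recovers the 124M figure GPT-2 is named after.
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Trainable parameters with weight tying: {total_params_gpt2:,}")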
@@ -864,7 +828,8 @@
" logits = model(inputs)\n",
"\n",
"probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary\n",
"print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)"
"print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)\n",
"print(probas)"
]
},
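Editorial note: a natural follow-up, sketched here under the assumption that the tiktoken "gpt2" encoding used later in the notebook is available, is to map the probabilities back to token IDs and text via greedy argmax decoding:

import tiktoken

token_ids = torch.argmax(probas, dim=-1, keepdim=True)  # most likely token per position
print("Token IDs:\n", token_ids)

tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.decode(token_ids[0].flatten().tolist()))  # decode the first sequence back to text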
{
@@ -1708,15 +1673,15 @@
"source": [
"# Note:\n",
"# Uncomment the following code to calculate the execution time\n",
"# import time\n",
"# start_time = time.time()\n",
"import time\n",
"start_time = time.time()\n",
"\n",
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"model.to(device)\n",
"optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)\n",
"\n",
"num_epochs = 10\n",
"num_epochs = 25\n",
"train_losses, val_losses, tokens_seen = train_model_simple(\n",
" model, train_loader, val_loader, optimizer, device,\n",
" num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
@@ -1725,8 +1690,8 @@
"\n",
"# Note:\n",
"# Uncomment the following code to show the execution time\n",
"# end_time = time.time()\n",
"# execution_time_minutes = (end_time - start_time) / 60\n",
"end_time = time.time()\n",
"execution_time_minutes = (end_time - start_time) / 60\n",
"# print(f\"Training completed in {execution_time_minutes:.2f} minutes.\")"
]
},
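Editorial note: after training, the recorded losses are easiest to judge as curves. A minimal plotting sketch, assuming matplotlib is installed and using the train_losses, val_losses, and tokens_seen lists returned by train_model_simple above:

import matplotlib.pyplot as plt

plt.plot(tokens_seen, train_losses, label="Training loss")
plt.plot(tokens_seen, val_losses, label="Validation loss")
plt.xlabel("Tokens seen")
plt.ylabel("Cross-entropy loss")
plt.legend()
plt.show()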
@@ -1824,7 +1789,6 @@
},
"outputs": [],
"source": [
"model.to(\"cpu\")\n",
"model.eval()\n",
"\n",
"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
@@ -2474,8 +2438,107 @@
},
"outputs": [],
"source": [
"# Relative import from the gpt_download.py contained in this folder\n",
"from gpt_download import download_and_load_gpt2"
"import json\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"def download_and_load_gpt2(model_size, models_dir):\n",
" # Validate model size\n",
" allowed_sizes = (\"124M\", \"355M\", \"774M\", \"1558M\")\n",
" if model_size not in allowed_sizes:\n",
" raise ValueError(f\"Model size not in {allowed_sizes}\")\n",
"\n",
" # Define paths\n",
" model_dir = os.path.join(models_dir, model_size)\n",
" base_url = \"https://openaipublic.blob.core.windows.net/gpt-2/models\"\n",
" filenames = [\n",
" \"checkpoint\", \"encoder.json\", \"hparams.json\",\n",
" \"model.ckpt.data-00000-of-00001\", \"model.ckpt.index\",\n",
" \"model.ckpt.meta\", \"vocab.bpe\"\n",
" ]\n",
"\n",
" # Download files\n",
" os.makedirs(model_dir, exist_ok=True)\n",
" for filename in filenames:\n",
" file_url = os.path.join(base_url, model_size, filename)\n",
" file_path = os.path.join(model_dir, filename)\n",
" download_file(file_url, file_path)\n",
"\n",
" # Load settings and params\n",
" tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n",
" settings = json.load(open(os.path.join(model_dir, \"hparams.json\")))\n",
" params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)\n",
"\n",
" return settings, params\n",
"\n",
"\n",
"def download_file(url, destination):\n",
" # Send a GET request to download the file\n",
"\n",
" try:\n",
" with urllib.request.urlopen(url) as response:\n",
" # Get the total file size from headers, defaulting to 0 if not present\n",
" file_size = int(response.headers.get(\"Content-Length\", 0))\n",
"\n",
" # Check if file exists and has the same size\n",
" if os.path.exists(destination):\n",
" file_size_local = os.path.getsize(destination)\n",
" if file_size == file_size_local:\n",
" print(f\"File already exists and is up-to-date: {destination}\")\n",
" return\n",
"\n",
" # Define the block size for reading the file\n",
" block_size = 1024 # 1 Kilobyte\n",
"\n",
" # Initialize the progress bar with total file size\n",
" progress_bar_description = os.path.basename(url) # Extract filename from URL\n",
" with tqdm(total=file_size, unit=\"iB\", unit_scale=True, desc=progress_bar_description) as progress_bar:\n",
" # Open the destination file in binary write mode\n",
" with open(destination, \"wb\") as file:\n",
" # Read the file in chunks and write to destination\n",
" while True:\n",
" chunk = response.read(block_size)\n",
" if not chunk:\n",
" break\n",
" file.write(chunk)\n",
" progress_bar.update(len(chunk)) # Update progress bar\n",
" except urllib.error.HTTPError:\n",
" s = (\n",
" f\"The specified URL ({url}) is incorrect, the internet connection cannot be established,\"\n",
" \"\\nor the requested file is temporarily unavailable.\\nPlease visit the following website\"\n",
" \" for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273\")\n",
" print(s)\n",
"\n",
"\n",
"def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):\n",
" # Initialize parameters dictionary with empty blocks for each layer\n",
" params = {\"blocks\": [{} for _ in range(settings[\"n_layer\"])]}\n",
"\n",
" # Iterate over each variable in the checkpoint\n",
" for name, _ in tf.train.list_variables(ckpt_path):\n",
" # Load the variable and remove singleton dimensions\n",
" variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))\n",
"\n",
" # Process the variable name to extract relevant parts\n",
" variable_name_parts = name.split(\"/\")[1:] # Skip the 'model/' prefix\n",
"\n",
" # Identify the target dictionary for the variable\n",
" target_dict = params\n",
" if variable_name_parts[0].startswith(\"h\"):\n",
" layer_number = int(variable_name_parts[0][1:])\n",
" target_dict = params[\"blocks\"][layer_number]\n",
"\n",
" # Recursively access or create nested dictionaries\n",
" for key in variable_name_parts[1:-1]:\n",
" target_dict = target_dict.setdefault(key, {})\n",
"\n",
" # Assign the variable array to the last key\n",
" last_key = variable_name_parts[-1]\n",
" target_dict[last_key] = variable_array\n",
"\n",
" return params"
]
},
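Editorial note: for reference, a typical call to the helper defined above; the model size and target directory are illustrative, and the files are fetched into models_dir/model_size:

settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

print("Settings:", settings)                        # contents of hparams.json
print("Parameter dictionary keys:", params.keys())  # per-layer blocks plus embedding weights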