diff --git a/2024/weeks/week07/gpt2.ipynb b/2024/weeks/week07/gpt2.ipynb
index 97192db..b499bee 100644
--- a/2024/weeks/week07/gpt2.ipynb
+++ b/2024/weeks/week07/gpt2.ipynb
@@ -118,7 +118,7 @@
    "source": [
     "GPT_CONFIG_124M = {\n",
     "    \"vocab_size\": 50257,     # Vocabulary size\n",
-    "    \"context_length\": 1024,  # Context length\n",
+    "    \"context_length\": 256,   # Context length\n",
     "    \"emb_dim\": 768,           # Embedding dimension\n",
     "    \"n_heads\": 12,            # Number of attention heads\n",
     "    \"n_layers\": 12,           # Number of layers\n",
@@ -479,47 +479,7 @@
     "        x = self.drop_shortcut(x)\n",
     "        x = x + shortcut  # Add the original input back\n",
     "\n",
-    "        return x\n",
-    "\n",
-    "class GPTModel(nn.Module):\n",
-    "    def __init__(self, cfg):\n",
-    "        super().__init__()\n",
-    "        self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
-    "        self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
-    "        self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
-    "\n",
-    "        self.trf_blocks = nn.Sequential(\n",
-    "            *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n",
-    "\n",
-    "        self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
-    "        self.out_head = nn.Linear(\n",
-    "            cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
-    "        )\n",
-    "\n",
-    "    def forward(self, in_idx):\n",
-    "        batch_size, seq_len = in_idx.shape\n",
-    "        tok_embeds = self.tok_emb(in_idx)\n",
-    "        pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
-    "        x = tok_embeds + pos_embeds  # Shape [batch_size, num_tokens, emb_size]\n",
-    "        x = self.drop_emb(x)\n",
-    "        x = self.trf_blocks(x)\n",
-    "        x = self.final_norm(x)\n",
-    "        logits = self.out_head(x)\n",
-    "        return logits\n",
-    "\n",
-    "GPT_CONFIG_124M = {\n",
-    "    \"vocab_size\": 50257,     # Vocabulary size\n",
-    "    \"context_length\": 256,   # Shortened context length (orig: 1024)\n",
-    "    \"emb_dim\": 768,           # Embedding dimension\n",
-    "    \"n_heads\": 12,            # Number of attention heads\n",
-    "    \"n_layers\": 12,           # Number of layers\n",
-    "    \"drop_rate\": 0.1,         # Dropout rate\n",
-    "    \"qkv_bias\": False         # Query-key-value bias\n",
-    "}\n",
-    "\n",
-    "torch.manual_seed(123)\n",
-    "model = GPTModel(GPT_CONFIG_124M)\n",
-    "model.eval();  # Disable dropout during inference"
+    "        return x\n"
    ]
   },
   {
@@ -629,6 +589,10 @@
   {
    "cell_type": "code",
    "source": [
+    "torch.manual_seed(123)\n",
+    "model = GPTModel(GPT_CONFIG_124M)\n",
+    "\n",
+    "\n",
     "total_params = sum(p.numel() for p in model.parameters())\n",
     "print(f\"Total number of parameters: {total_params:,}\")"
    ],
@@ -864,7 +828,8 @@
     "    logits = model(inputs)\n",
     "\n",
     "probas = torch.softmax(logits, dim=-1)  # Probability of each token in vocabulary\n",
-    "print(probas.shape)  # Shape: (batch_size, num_tokens, vocab_size)"
+    "print(probas.shape)  # Shape: (batch_size, num_tokens, vocab_size)\n",
+    "print(probas)"
    ]
   },
   {
@@ -1708,15 +1673,15 @@
    "source": [
     "# Note:\n",
     "# Uncomment the following code to calculate the execution time\n",
-    "# import time\n",
-    "# start_time = time.time()\n",
+    "import time\n",
+    "start_time = time.time()\n",
     "\n",
     "torch.manual_seed(123)\n",
     "model = GPTModel(GPT_CONFIG_124M)\n",
     "model.to(device)\n",
     "optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)\n",
     "\n",
-    "num_epochs = 10\n",
+    "num_epochs = 25\n",
     "train_losses, val_losses, tokens_seen = train_model_simple(\n",
     "    model, train_loader, val_loader, optimizer, device,\n",
     "    num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
@@ -1725,8 +1690,8 @@
     "\n",
     "# Note:\n",
     "# Uncomment the following code to show the execution time\n",
-    "# end_time = time.time()\n",
-    "# execution_time_minutes = (end_time - start_time) / 60\n",
+    "end_time = time.time()\n",
+    "execution_time_minutes = (end_time - start_time) / 60\n",
-    "# print(f\"Training completed in {execution_time_minutes:.2f} minutes.\")"
+    "print(f\"Training completed in {execution_time_minutes:.2f} minutes.\")"
    ]
   },
@@ -1824,7 +1789,6 @@
   },
   "outputs": [],
   "source": [
-    "model.to(\"cpu\")\n",
     "model.eval()\n",
     "\n",
     "tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
@@ -2474,8 +2438,109 @@
   },
   "outputs": [],
   "source": [
-    "# Relative import from the gpt_download.py contained in this folder\n",
-    "from gpt_download import download_and_load_gpt2"
+    "import os\n",
+    "import urllib.request\n",
+    "import json\n",
+    "import numpy as np\n",
+    "import tensorflow as tf\n",
+    "from tqdm import tqdm\n",
+    "\n",
+    "\n",
+    "def download_and_load_gpt2(model_size, models_dir):\n",
+    "    # Validate model size\n",
+    "    allowed_sizes = (\"124M\", \"355M\", \"774M\", \"1558M\")\n",
+    "    if model_size not in allowed_sizes:\n",
+    "        raise ValueError(f\"Model size not in {allowed_sizes}\")\n",
+    "\n",
+    "    # Define paths\n",
+    "    model_dir = os.path.join(models_dir, model_size)\n",
+    "    base_url = \"https://openaipublic.blob.core.windows.net/gpt-2/models\"\n",
+    "    filenames = [\n",
+    "        \"checkpoint\", \"encoder.json\", \"hparams.json\",\n",
+    "        \"model.ckpt.data-00000-of-00001\", \"model.ckpt.index\",\n",
+    "        \"model.ckpt.meta\", \"vocab.bpe\"\n",
+    "    ]\n",
+    "\n",
+    "    # Download files\n",
+    "    os.makedirs(model_dir, exist_ok=True)\n",
+    "    for filename in filenames:\n",
+    "        file_url = os.path.join(base_url, model_size, filename)\n",
+    "        file_path = os.path.join(model_dir, filename)\n",
+    "        download_file(file_url, file_path)\n",
+    "\n",
+    "    # Load settings and params\n",
+    "    tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n",
+    "    settings = json.load(open(os.path.join(model_dir, \"hparams.json\")))\n",
+    "    params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)\n",
+    "\n",
+    "    return settings, params\n",
+    "\n",
+    "\n",
+    "def download_file(url, destination):\n",
+    "    # Send a GET request to download the file\n",
+    "\n",
+    "    try:\n",
+    "        with urllib.request.urlopen(url) as response:\n",
+    "            # Get the total file size from headers, defaulting to 0 if not present\n",
+    "            file_size = int(response.headers.get(\"Content-Length\", 0))\n",
+    "\n",
+    "            # Check if file exists and has the same size\n",
+    "            if os.path.exists(destination):\n",
+    "                file_size_local = os.path.getsize(destination)\n",
+    "                if file_size == file_size_local:\n",
+    "                    print(f\"File already exists and is up-to-date: {destination}\")\n",
+    "                    return\n",
+    "\n",
+    "            # Define the block size for reading the file\n",
+    "            block_size = 1024  # 1 Kilobyte\n",
+    "\n",
+    "            # Initialize the progress bar with total file size\n",
+    "            progress_bar_description = os.path.basename(url)  # Extract filename from URL\n",
+    "            with tqdm(total=file_size, unit=\"iB\", unit_scale=True, desc=progress_bar_description) as progress_bar:\n",
+    "                # Open the destination file in binary write mode\n",
+    "                with open(destination, \"wb\") as file:\n",
+    "                    # Read the file in chunks and write to destination\n",
+    "                    while True:\n",
+    "                        chunk = response.read(block_size)\n",
+    "                        if not chunk:\n",
+    "                            break\n",
+    "                        file.write(chunk)\n",
+    "                        progress_bar.update(len(chunk))  # Update progress bar\n",
+    "    except urllib.error.HTTPError:\n",
+    "        s = (\n",
+    "            f\"The specified URL ({url}) is incorrect, the internet connection cannot be established,\"\n",
+    "            \"\\nor the requested file is temporarily unavailable.\\nPlease visit the following website\"\n",
+    "            \" for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273\")\n",
+    "        print(s)\n",
+    "\n",
+    "\n",
+    "def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):\n",
+    "    # Initialize parameters dictionary with empty blocks for each layer\n",
+    "    params = {\"blocks\": [{} for _ in range(settings[\"n_layer\"])]}\n",
+    "\n",
+    "    # Iterate over each variable in the checkpoint\n",
+    "    for name, _ in tf.train.list_variables(ckpt_path):\n",
+    "        # Load the variable and remove singleton dimensions\n",
+    "        variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))\n",
+    "\n",
+    "        # Process the variable name to extract relevant parts\n",
+    "        variable_name_parts = name.split(\"/\")[1:]  # Skip the 'model/' prefix\n",
+    "\n",
+    "        # Identify the target dictionary for the variable\n",
+    "        target_dict = params\n",
+    "        if variable_name_parts[0].startswith(\"h\"):\n",
+    "            layer_number = int(variable_name_parts[0][1:])\n",
+    "            target_dict = params[\"blocks\"][layer_number]\n",
+    "\n",
+    "        # Recursively access or create nested dictionaries\n",
+    "        for key in variable_name_parts[1:-1]:\n",
+    "            target_dict = target_dict.setdefault(key, {})\n",
+    "\n",
+    "        # Assign the variable array to the last key\n",
+    "        last_key = variable_name_parts[-1]\n",
+    "        target_dict[last_key] = variable_array\n",
+    "\n",
+    "    return params"
    ]
   },
   {
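Note on the parameter-count cell this diff touches: because GPTModel keeps separate weights for tok_emb and out_head, the raw count it prints is far above the advertised 124M, which assumes GPT-2-style weight tying. A minimal sketch of the distinction, assuming the notebook's GPTModel and GPT_CONFIG_124M are in scope; the approximate figures in the comments assume context_length=256 (with the original 1024 they come out near 163.0M and 124.4M):

    import torch

    torch.manual_seed(123)
    model = GPTModel(GPT_CONFIG_124M)  # assumes the notebook's class and config

    total_params = sum(p.numel() for p in model.parameters())
    # GPT-2's published 124M figure ties the token embedding and output head;
    # subtracting the untied output head approximates the tied count.
    out_head_params = sum(p.numel() for p in model.out_head.parameters())
    print(f"Raw (untied) total:  {total_params:,}")                    # roughly 162.4M
    print(f"Approx. tied total:  {total_params - out_head_params:,}")  # roughly 123.8M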
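Note on the probas cell: printing the full tensor is useful for inspection, but it may also help to show how those probabilities become a prediction; the greedy next token is the argmax over the vocabulary at the last position. A short sketch, assuming probas from that cell is still in scope:

    import torch
    import tiktoken

    tokenizer = tiktoken.get_encoding("gpt2")

    # probas has shape (batch_size, num_tokens, vocab_size); take the last position
    next_token_ids = torch.argmax(probas[:, -1, :], dim=-1)  # shape: (batch_size,)
    for token_id in next_token_ids:
        print(token_id.item(), "->", repr(tokenizer.decode([token_id.item()])))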
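Note on the inlined loader: a usage sketch, with the call signature taken from the function defined above; it assumes TensorFlow and tqdm are installed, and the first run downloads roughly 500 MB of checkpoint files for the 124M model:

    settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

    print("Settings:", settings)             # hparams.json contents (n_ctx, n_embd, n_head, n_layer, ...)
    print("Parameter keys:", params.keys())  # 'blocks' plus top-level arrays such as 'wte' and 'wpe'
    print("Token embedding shape:", params["wte"].shape)  # (50257, 768) for the 124M model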