Commit

update gpt-2 code
akki2825 committed Nov 20, 2024
1 parent 513840e commit ca8bb4e
Showing 1 changed file with 114 additions and 51 deletions.
165 changes: 114 additions & 51 deletions 2024/weeks/week07/gpt2.ipynb
@@ -118,7 +118,7 @@
"source": [
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 1024, # Context length\n",
" \"context_length\": 256, # Context length\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
@@ -479,47 +479,7 @@
" x = self.drop_shortcut(x)\n",
" x = x + shortcut # Add the original input back\n",
"\n",
" return x\n",
"\n",
"class GPTModel(nn.Module):\n",
" def __init__(self, cfg):\n",
" super().__init__()\n",
" self.tok_emb = nn.Embedding(cfg[\"vocab_size\"], cfg[\"emb_dim\"])\n",
" self.pos_emb = nn.Embedding(cfg[\"context_length\"], cfg[\"emb_dim\"])\n",
" self.drop_emb = nn.Dropout(cfg[\"drop_rate\"])\n",
"\n",
" self.trf_blocks = nn.Sequential(\n",
" *[TransformerBlock(cfg) for _ in range(cfg[\"n_layers\"])])\n",
"\n",
" self.final_norm = LayerNorm(cfg[\"emb_dim\"])\n",
" self.out_head = nn.Linear(\n",
" cfg[\"emb_dim\"], cfg[\"vocab_size\"], bias=False\n",
" )\n",
"\n",
" def forward(self, in_idx):\n",
" batch_size, seq_len = in_idx.shape\n",
" tok_embeds = self.tok_emb(in_idx)\n",
" pos_embeds = self.pos_emb(torch.arange(seq_len, device=in_idx.device))\n",
" x = tok_embeds + pos_embeds # Shape [batch_size, num_tokens, emb_size]\n",
" x = self.drop_emb(x)\n",
" x = self.trf_blocks(x)\n",
" x = self.final_norm(x)\n",
" logits = self.out_head(x)\n",
" return logits\n",
"\n",
"GPT_CONFIG_124M = {\n",
" \"vocab_size\": 50257, # Vocabulary size\n",
" \"context_length\": 256, # Shortened context length (orig: 1024)\n",
" \"emb_dim\": 768, # Embedding dimension\n",
" \"n_heads\": 12, # Number of attention heads\n",
" \"n_layers\": 12, # Number of layers\n",
" \"drop_rate\": 0.1, # Dropout rate\n",
" \"qkv_bias\": False # Query-key-value bias\n",
"}\n",
"\n",
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"model.eval(); # Disable dropout during inference"
" return x\n"
]
},
{
@@ -629,6 +589,10 @@
{
"cell_type": "code",
"source": [
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"\n",
"\n",
"total_params = sum(p.numel() for p in model.parameters())\n",
"print(f\"Total number of parameters: {total_params:,}\")"
],
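Editorial note: the count printed by this cell comes out around 163 million rather than 124 million, because this GPTModel keeps a separate output projection while the original GPT-2 ties it to the token embedding. A minimal sketch of the tied figure, assuming the model instantiated above:

# out_head and tok_emb share the shape (vocab_size x emb_dim), so removing
# the duplicate recovers the 124M figure GPT-2 is named after.
total_params_gpt2 = total_params - sum(p.numel() for p in model.out_head.parameters())
print(f"Trainable parameters with weight tying: {total_params_gpt2:,}")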
@@ -864,7 +828,8 @@
" logits = model(inputs)\n",
"\n",
"probas = torch.softmax(logits, dim=-1) # Probability of each token in vocabulary\n",
"print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)"
"print(probas.shape) # Shape: (batch_size, num_tokens, vocab_size)\n",
"print(probas)"
]
},
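Editorial note: a natural follow-up, sketched here under the assumption that the tiktoken "gpt2" encoding used later in the notebook is available, is to map the probabilities back to token IDs and text via greedy argmax decoding:

import tiktoken

token_ids = torch.argmax(probas, dim=-1, keepdim=True)  # most likely token per position
print("Token IDs:\n", token_ids)

tokenizer = tiktoken.get_encoding("gpt2")
print(tokenizer.decode(token_ids[0].flatten().tolist()))  # decode the first sequence back to text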
{
@@ -1708,15 +1673,15 @@
"source": [
"# Note:\n",
"# Uncomment the following code to calculate the execution time\n",
"# import time\n",
"# start_time = time.time()\n",
"import time\n",
"start_time = time.time()\n",
"\n",
"torch.manual_seed(123)\n",
"model = GPTModel(GPT_CONFIG_124M)\n",
"model.to(device)\n",
"optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)\n",
"\n",
"num_epochs = 10\n",
"num_epochs = 25\n",
"train_losses, val_losses, tokens_seen = train_model_simple(\n",
" model, train_loader, val_loader, optimizer, device,\n",
" num_epochs=num_epochs, eval_freq=5, eval_iter=5,\n",
@@ -1725,8 +1690,8 @@
"\n",
"# Note:\n",
"# Uncomment the following code to show the execution time\n",
"# end_time = time.time()\n",
"# execution_time_minutes = (end_time - start_time) / 60\n",
"end_time = time.time()\n",
"execution_time_minutes = (end_time - start_time) / 60\n",
"# print(f\"Training completed in {execution_time_minutes:.2f} minutes.\")"
]
},
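Editorial note: after training, the recorded losses are easiest to judge as curves. A minimal plotting sketch, assuming matplotlib is installed and using the train_losses, val_losses, and tokens_seen lists returned by train_model_simple above:

import matplotlib.pyplot as plt

plt.plot(tokens_seen, train_losses, label="Training loss")
plt.plot(tokens_seen, val_losses, label="Validation loss")
plt.xlabel("Tokens seen")
plt.ylabel("Cross-entropy loss")
plt.legend()
plt.show()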
@@ -1824,7 +1789,6 @@
},
"outputs": [],
"source": [
"model.to(\"cpu\")\n",
"model.eval()\n",
"\n",
"tokenizer = tiktoken.get_encoding(\"gpt2\")\n",
@@ -2474,8 +2438,107 @@
},
"outputs": [],
"source": [
"# Relative import from the gpt_download.py contained in this folder\n",
"from gpt_download import download_and_load_gpt2"
"import json\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tqdm import tqdm\n",
"\n",
"\n",
"def download_and_load_gpt2(model_size, models_dir):\n",
" # Validate model size\n",
" allowed_sizes = (\"124M\", \"355M\", \"774M\", \"1558M\")\n",
" if model_size not in allowed_sizes:\n",
" raise ValueError(f\"Model size not in {allowed_sizes}\")\n",
"\n",
" # Define paths\n",
" model_dir = os.path.join(models_dir, model_size)\n",
" base_url = \"https://openaipublic.blob.core.windows.net/gpt-2/models\"\n",
" filenames = [\n",
" \"checkpoint\", \"encoder.json\", \"hparams.json\",\n",
" \"model.ckpt.data-00000-of-00001\", \"model.ckpt.index\",\n",
" \"model.ckpt.meta\", \"vocab.bpe\"\n",
" ]\n",
"\n",
" # Download files\n",
" os.makedirs(model_dir, exist_ok=True)\n",
" for filename in filenames:\n",
" file_url = os.path.join(base_url, model_size, filename)\n",
" file_path = os.path.join(model_dir, filename)\n",
" download_file(file_url, file_path)\n",
"\n",
" # Load settings and params\n",
" tf_ckpt_path = tf.train.latest_checkpoint(model_dir)\n",
" settings = json.load(open(os.path.join(model_dir, \"hparams.json\")))\n",
" params = load_gpt2_params_from_tf_ckpt(tf_ckpt_path, settings)\n",
"\n",
" return settings, params\n",
"\n",
"\n",
"def download_file(url, destination):\n",
" # Send a GET request to download the file\n",
"\n",
" try:\n",
" with urllib.request.urlopen(url) as response:\n",
" # Get the total file size from headers, defaulting to 0 if not present\n",
" file_size = int(response.headers.get(\"Content-Length\", 0))\n",
"\n",
" # Check if file exists and has the same size\n",
" if os.path.exists(destination):\n",
" file_size_local = os.path.getsize(destination)\n",
" if file_size == file_size_local:\n",
" print(f\"File already exists and is up-to-date: {destination}\")\n",
" return\n",
"\n",
" # Define the block size for reading the file\n",
" block_size = 1024 # 1 Kilobyte\n",
"\n",
" # Initialize the progress bar with total file size\n",
" progress_bar_description = os.path.basename(url) # Extract filename from URL\n",
" with tqdm(total=file_size, unit=\"iB\", unit_scale=True, desc=progress_bar_description) as progress_bar:\n",
" # Open the destination file in binary write mode\n",
" with open(destination, \"wb\") as file:\n",
" # Read the file in chunks and write to destination\n",
" while True:\n",
" chunk = response.read(block_size)\n",
" if not chunk:\n",
" break\n",
" file.write(chunk)\n",
" progress_bar.update(len(chunk)) # Update progress bar\n",
" except urllib.error.HTTPError:\n",
" s = (\n",
" f\"The specified URL ({url}) is incorrect, the internet connection cannot be established,\"\n",
" \"\\nor the requested file is temporarily unavailable.\\nPlease visit the following website\"\n",
" \" for help: https://github.com/rasbt/LLMs-from-scratch/discussions/273\")\n",
" print(s)\n",
"\n",
"\n",
"def load_gpt2_params_from_tf_ckpt(ckpt_path, settings):\n",
" # Initialize parameters dictionary with empty blocks for each layer\n",
" params = {\"blocks\": [{} for _ in range(settings[\"n_layer\"])]}\n",
"\n",
" # Iterate over each variable in the checkpoint\n",
" for name, _ in tf.train.list_variables(ckpt_path):\n",
" # Load the variable and remove singleton dimensions\n",
" variable_array = np.squeeze(tf.train.load_variable(ckpt_path, name))\n",
"\n",
" # Process the variable name to extract relevant parts\n",
" variable_name_parts = name.split(\"/\")[1:] # Skip the 'model/' prefix\n",
"\n",
" # Identify the target dictionary for the variable\n",
" target_dict = params\n",
" if variable_name_parts[0].startswith(\"h\"):\n",
" layer_number = int(variable_name_parts[0][1:])\n",
" target_dict = params[\"blocks\"][layer_number]\n",
"\n",
" # Recursively access or create nested dictionaries\n",
" for key in variable_name_parts[1:-1]:\n",
" target_dict = target_dict.setdefault(key, {})\n",
"\n",
" # Assign the variable array to the last key\n",
" last_key = variable_name_parts[-1]\n",
" target_dict[last_key] = variable_array\n",
"\n",
" return params"
]
},
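Editorial note: for reference, a typical call to the helper defined above; the model size and target directory are illustrative, and the files are fetched into models_dir/model_size:

settings, params = download_and_load_gpt2(model_size="124M", models_dir="gpt2")

print("Settings:", settings)                        # contents of hparams.json
print("Parameter dictionary keys:", params.keys())  # per-layer blocks plus embedding weights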