From 327738e019174deaac90466f8375ef15a6008a73 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Sat, 5 Mar 2022 20:14:16 +0900 Subject: [PATCH 01/21] modify spelling saving and loading a general checkpoint.py \n intial -> initial --- .../recipes/saving_and_loading_a_general_checkpoint.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py index 6e0c490ec2..a31f43970f 100644 --- a/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py +++ b/recipes_source/recipes/saving_and_loading_a_general_checkpoint.py @@ -42,7 +42,7 @@ # ----- # # 1. Import all necessary libraries for loading our data -# 2. Define and intialize the neural network +# 2. Define and initialize the neural network # 3. Initialize the optimizer # 4. Save the general checkpoint # 5. Load the general checkpoint @@ -60,7 +60,7 @@ ###################################################################### -# 2. Define and intialize the neural network +# 2. Define and initialize the neural network # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # For sake of example, we will create a neural network for training From 6e9265001b86dbc0ce90ae5a6e2e4baa1392c3c0 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Sat, 13 Jul 2024 19:22:33 +0900 Subject: [PATCH 02/21] feat : tutorials toctree : Image and Video Pretraining Vgg from scratch --- .../Pretraining_Vgg_from_scratch.py | 643 ++++++++++++++++++ index.rst | 8 +- 2 files changed, 650 insertions(+), 1 deletion(-) create mode 100644 beginner_source/Pretraining_Vgg_from_scratch.py diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py new file mode 100644 index 0000000000..1ddcd745e2 --- /dev/null +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -0,0 +1,643 @@ +""" +In this tutorial, we will embark on an exciting journey to build and +train a VGG network from scratch using Python and popular deep learning +libraries such as PyTorch. We will dive into the details of the VGG +architecture, understanding its components and the rationale behind its +design. + +Our tutorial is designed for both beginners who are new to deep learning +and seasoned practitioners looking to deepen their understanding of CNN +architectures. + +""" + + +###################################################################### +# Author : `WoongJoon Choi `__ +# + +import torch.optim as optim +import albumentations as A +import numpy as np + + +from torchvision.datasets import CIFAR100,CIFAR10,MNIST,ImageNet +import os +from PIL import Image + + + + + +###################################################################### +# I recommend using GPU for this tutorial. +# + +device = 'cuda' if torch.cuda.is_available() else 'cpu' + + +###################################################################### +# Worth point of this tutorial +# ---------------------------- +# + + +###################################################################### +# - We train the model from scratch using only the configuration +# presented in the paper. +# +# - we do not use future method, like BatchNormalization,Adam , He +# initialization. +# +# - You can apply to ImageNet Data. +# +# - If you can download the ImageNet Data(140GB), you can apply this +# tutorial to reproduce Original VGG. +# +# - You can learn VGG within the training time suggested in the paper. 
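#
# (editor's note) As written, the ``device`` line above references
# ``torch`` before that name is bound: ``import torch.optim as optim``
# does not bind ``torch`` itself, which is only imported further down,
# just before the model definition. A later patch in this series
# (PATCH 08/21) adds ``import torch`` to the top-level imports.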
+# + + +###################################################################### +# Why Vgg is so popluar ? +# ----------------------- +# + + +###################################################################### +# VGG became a model that attracted attention because it succeeded in +# building deeper layers and dramatically shortening the training time +# compared to alexNet, which was the sota model at the time.: +# + + +###################################################################### +# VGG Configuration +# ----------------- +# + + +###################################################################### +# We define some configurations suggested in VGG paper . Details about +# this configuration will be explained below section. +# + +DatasetName = 'Cifar' # Cifar ,Cifar10, Mnist , ImageNet + +## model configuration + +num_classes = 100 +# CalTech 257 Cifar 100 Cifar10 10 ,Mnist 10 ImageNet 1000 +model_version = None ## you must configure it. + +## data configuration + +train_min = 256 +train_max = None +test_min = 256 +test_max = 256 + +## train configuration + +batch_size = 32 +lr = 1e-2 +momentum = 0.9 +weight_decay = 5e-4 +lr_factor = 0.1 +epoch = 10 +clip= None # model D grad clip 0.7 + + +update_count = int(256/batch_size) +accum_step = int(256/batch_size) +eval_step =26 * accum_step ## CalTech 5 Cifar 5 Mnist 6 , Cifar10 5 ImageNet 26 + + +## model configuration +xavier_count= 4 + +last_xavier = -8 ## + +except_xavier = None + +model_layers =None + + + +###################################################################### +# | If your GPU memory is 24GB ,The maximum batch size is 128. But, if you +# use Colab , I recommend using 32 . +# | You can modify the batch size according to your preference. +# + + +###################################################################### +# Define dataset +# -------------- +# + + +###################################################################### +# We use ``CIFAR100`` Dataset in this tutorial. In Vgg paper , the authors +# scales image istropically . Then , they apply +# Normalization,RandomCrop,HorizontalFlip . So , we need to override +# CIFAR100 class to apply preprocessing. +# + +class Custom_Cifar(CIFAR100) : + def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,download=False,val=False,train=True): + + self.multi = multi + self.s_max = 512 + self.s_min= 256 + if multi : + self.S = np.random.randint(low=self.s_min,high=self.s_max) + else : + self.S = s_min + transform = A.Compose( + [ + A.Normalize(mean =(0.5071, 0.4867, 0.4408) , std = (0.2675, 0.2565, 0.2761)), + A.SmallestMaxSize(max_size=self.S), + A.RandomCrop(height =224,width=224), + A.HorizontalFlip(), + # A.RGBShift() + ] + + ) + super().__init__(root,transform=transform,train=train,download=download) + self.val =train + self.multi = multi + def __getitem__(self, index: int) : + """ + Args: + index (int): Index + + Returns: + tuple: (image, target) where target is index of the target class. 
+ """ + img, target = self.data[index], self.targets[index] + + # doing this so that it is consistent with all other datasets + # to return a PIL Image + + img = Image.fromarray(img) + + if img.mode == 'L' : img = img.convert('RGB') + img=np.array(img,dtype=np.float32) + + + if self.transform is not None: + img = self.transform(image=img) + if len(img['image'].shape) == 3 and self.val==False : + img = A.RGBShift()(image=img['image']) + img = img['image'] + + if self.target_transform is not None: + target = self.target_transform(target) + + img=img.transpose((2,0,1)) + + return img, target + + +###################################################################### +# Define Model +# ------------ +# + + +###################################################################### +# | In Vgg paper, they do experiment over 6 models. model A is 11 layers, +# model B is 13 layers, model C is 16 layers , model D is 16 laeyrs and +# model D is 19 layers . you can train all version of models to +# reproduce VGG . +# | ``Config_Channels`` means output channels and ``Config_kernels`` means +# kerenl size. +# + +import torch +from torch import nn + + +Config_channels = { +"A" : [64,"M" , 128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , +"A_lrn" : [64,"LRN","M" , 128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , +"B" :[64,64,"M" , 128,128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , +"C" : [64,64,"M" , 128,128, "M" , 256,256,256,"M" ,512,512 ,512,"M" , 512,512,512,"M"] , +"D" :[64,64,"M" , 128,128, "M" , 256,256,256,"M" ,512,512 ,512,"M" , 512,512,512,"M"] , +"E" :[64,64,"M" , 128,128, "M" , 256,256,256,256,"M" ,512,512 ,512,512,"M" , 512,512,512,512,"M"] , + +} + + + +Config_kernel = { +"A" : [3,2 , 3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , +"A_lrn" : [3,2,2 , 3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , +"B" :[3,3,2 , 3,3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , +"C" : [3,3,2 , 3,3, 2 , 3,3,1,2 ,3,3 ,1,2 , 3,3,1,2] , +"D" :[3,3,2 , 3,3, 2 , 3,3,3,2 ,3,3 ,3,2 , 3,3,3,2] , +"E" :[3,3,2 , 3,3, 2 , 3,3,3,3,2 ,3,3 ,3,3,2 , 3,3,3,3,2] , + +} + + +###################################################################### +# We define model class that generate model in choice of 6 versions. 
+# + +def make_feature_extractor(cfg_c,cfg_k): + feature_extract = [] + in_channels = 3 + i = 1 + for out_channels , kernel in zip(cfg_c,cfg_k) : + # print(f"{i} th layer {out_channels} processing") + if out_channels == "M" : + feature_extract += [nn.MaxPool2d(kernel,2) ] + elif out_channels == "LRN": + feature_extract += [nn.LocalResponseNorm(5,k=2) , nn.ReLU()] + elif out_channels == 1: + feature_extract+= [nn.Conv2d(in_channels,out_channels,kernel,stride = 1) , nn.ReLU()] + else : + feature_extract+= [nn.Conv2d(in_channels,out_channels,kernel,stride = 1 , padding = 1) , nn.ReLU()] + + if isinstance(out_channels,int) : in_channels = out_channels + i+=1 + return nn.Sequential(*feature_extract) + + +class Model_vgg(nn.Module) : + def __init__(self,version , num_classes): + conv_5_out_w ,conv_5_out_h = 7,7 + conv_5_out_dim =512 + conv_1_by_1_1_outchannel = 4096 + conv_1_by_1_2_outchannel = 4096 + self.num_classes = num_classes + self.linear_out = 4096 + self.xavier_count = xavier_count + self.last_xavier= last_xavier ## if >0 , initialize last 3 fully connected noraml distribution + # conv_1_by_1_3_outchannel = num_classes + self.except_xavier = except_xavier + + super().__init__() + self.feature_extractor = make_feature_extractor(Config_channels[version] , Config_kernel[version]) + self.avgpool = nn.AdaptiveAvgPool2d((1,1)) + self.output_layer = nn.Sequential( + nn.Conv2d(conv_5_out_dim ,conv_1_by_1_1_outchannel ,7) , + nn.ReLU(), + nn.Dropout2d(), + nn.Conv2d(conv_1_by_1_1_outchannel ,conv_1_by_1_2_outchannel,1 ) , + nn.ReLU(), + nn.Dropout2d(), + nn.Conv2d(conv_1_by_1_2_outchannel ,num_classes,1 ) + ) + + + print('weight initialize') + self.apply(self._init_weights) + print('weight intialize end') + def forward(self,x): + x = self.feature_extractor(x) + # x= self.avgpool(x) ## If Linear is output, use this + # x= torch.flatten(x,start_dim = 1) ## If Linear is output, use this + x = self.output_layer(x) + x= self.avgpool(x) + x= torch.flatten(x,start_dim = 1) + return x + + + @torch.no_grad() + def _init_weights(self,m): + + # print(m) + if isinstance(m,nn.Conv2d): + print('-------------') + print(m.kernel_size) + print(m.out_channels) + # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) and self.last_xavier>0 : + if self.last_xavier>0 and (self.except_xavier is None or self.last_xavier!=self.except_xavier): + print('xavier') + # self.last_xavier-=1 + nn.init.xavier_uniform_(m.weight) + elif self.xavier_count >0 : + print('xavier') + nn.init.xavier_uniform_(m.weight) + self.xavier_count-=1 + else : + std = 0.1 + print(f'normal std : {std}') + + torch.nn.init.normal_(m.weight,std=std) + # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) : + # self.last_xavier+=10 + self.last_xavier +=1 + if m.bias is not None : + print('bias zero init') + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + if self.last_xavier >0 : + nn.init.xavier_uniform_(m.weight) + self.last_xavier-=1 + else : + torch.nn.init.normal_(m.weight,std=std) + self.last_xavier+=1 + print(f'last xavier increase to {self.last_xavier}') + nn.init.constant_(m.bias, 0) + + +###################################################################### +# Parameter Initialization +# ~~~~~~~~~~~~~~~~~~~~~~~~ +# + + +###################################################################### +# When training Vgg , the authors first train model A , then initialized +# the weights of other models with the weights of model A. Waiting for +# Model A to be trained takes a long time . 
The authors mention how to +# train with xavier initialization rather than initializing with the +# weights of model A. But, they do not mention how to initialize . +# +# | To Reproduce Vgg , we use xavier initialization method to initialize +# weights. We apply initialization to few first layes and last layers. +# Then , we apply random initialization to other layers. +# | **we must fix stdandrad deviation to 0.1**. If standard deviation is +# larger than 0.1, the weight get NAN values. For stability, we use 0.1 +# for standard deviation. +# | The ``front_xavier`` means how many layers we initialize with xavier +# initialization in front of layers and The ``last_xavier`` means how +# many layers we initializae with xavier initialization in last of +# layers. +# +# In My experiment, we can use ``front_xavier`` = 4 , ``last_xavier``\ =5 +# in model A, ``front_xavier`` =4 ``last_xavier``\ =7 in model B,C , D and +# ``front_xavier`` =5\ ``last_xavier``\ = 9 in model E . These values work +# fine. +# + + +###################################################################### +# Training Model +# -------------- +# + + +###################################################################### +# We will define top-k error. +# + +def accuracy(output, target, topk=(1,)): + """Computes the precision@k for the specified values of k""" + maxk = max(topk) + batch_size = target.size(0) + + _, pred = output.topk(maxk, 1, True, True) + pred = pred.t() + correct = pred.eq(target.view(1, -1).expand_as(pred)) + + res = [] + for k in topk: + # print(f'top {k}') + correct_k = correct[:k].reshape(-1).float().sum(0,keepdim=True) + # res.append(correct_k.mul_(100.0 / batch_size)) + res.append(correct_k) + return res + + +###################################################################### +# we initiate model and loss function and optimizer and schedulers. In +# vgg, they use softmax output ,Momentum Optimizer , and Scheduling based +# on accuarcy. +# + +model = Model_vgg(model_version,num_classes) +criterion = nn.CrossEntropyLoss() + +optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay,nesterov=nestrov,momentum=momentum) +scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',patience=10,threshold=1e-3,eps = 1e-5) + + + +###################################################################### +# we use ``CIFAR100`` . +# + +if DatasetName == 'Cifar' : + train_data = Custom_Cifar(root=os.getcwd(),download=True) + val_data = Custom_Cifar(root=os.getcwd(),train=False,download=True) + val_data.val= True + val_data.s_min = test_min + val_data.transform= A.Compose( + [ + A.Normalize(mean =(0.5071, 0.4867, 0.4408) , std = (0.2675, 0.2565, 0.2761)), + A.SmallestMaxSize(max_size=val_data.S), + A.CenterCrop(height =224,width=224), + # A.HorizontalFlip(), + # A.RGBShift() + ] + + ) + +train_loader = torch.utils.data.DataLoader(train_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) +val_loader = torch.utils.data.DataLoader(val_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) + + +###################################################################### +# we set grad_clip to 1.0 for prevent gradient exploding. 
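#
# As a small aside (an editor's sketch on a throwaway parameter, not part
# of the training loop), this is what ``clip_grad_norm_`` does: it rescales
# the gradients in place so that their global L2 norm is at most ``max_norm``.

p = torch.nn.Parameter(torch.zeros(3))
p.grad = torch.tensor([3.0, 4.0, 0.0])             # L2 norm = 5
torch.nn.utils.clip_grad_norm_([p], max_norm=1.0)  # rescales by 1/5
print(p.grad)                                      # tensor([0.6000, 0.8000, 0.0000])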
+# + +model = model.to(device) + +grad_clip = 1.0 + +for e in range(epoch-resume_epoch) : + print(f'Training Epoch : {e}') + total_loss = 0 + val_iter = iter(val_loader) + train_acc=[0,0] + train_num = 0 + + total_acc = [0,0] + count= 0 + for i , data in tqdm(enumerate(train_loader)) : + + + model.train() + img,label= data + img,label =img.to(device, non_blocking=True) ,label.to(device, non_blocking=True) + + output = model(img) + + loss = criterion(output,label) /accum_step + + temp_output ,temp_label = output.detach().to('cpu') , label.detach().to('cpu') + temp_acc = accuracy(temp_output,temp_label,(1,5)) + train_acc=[train_acc[0]+temp_acc[0] , train_acc[1]+temp_acc[1]] + train_num+=batch_size + temp_output,temp_label,temp_acc = None,None,None + + loss.backward() + total_loss += loss.detach().to('cpu') + img,label=None,None + torch.cuda.empty_cache() + if i> 0 and i%update_count == 0 : + print(f'Training steps : {i} parameter update loss :{total_loss} ') + if grad_clip is not None: + # print(f'Training steps : {i} parameter grad clip to {grad_clip}') + torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) + optimizer.step() + optimizer.zero_grad(set_to_none=True) + + if total_loss < 7.0 : + # print(f"train loss {total_loss}less than 7.0 ,set grad clip to {clip}") + grad_clip = clip + if i % eval_step != 0 : + total_loss = 0 + + output,loss = None,None + torch.cuda.empty_cache() + if i>0 and i % eval_step == 0 : + + print(f'train losss :{total_loss}') + temp_loss = total_loss + total_loss= 0 + + val_loss = 0 + torch.cuda.empty_cache() + + for j in tqdm(range(update_count)) : + loss = None + print(f'Evaluation Steps Start') + try : + img,label = next(val_iter) + except StopIteration : + val_iter= iter(val_loader) + img,label = next(val_iter) + with torch.no_grad(): + model.eval() + + img , label = img.to(device, non_blocking=True) , label.to(device, non_blocking=True) + output = model(img) + temp_output ,temp_label = output.detach().to('cpu') , label.detach().to('cpu') + temp_acc = accuracy(temp_output,temp_label,(1,5)) + total_acc=[total_acc[0]+temp_acc[0] , total_acc[1]+temp_acc[1]] + count+=batch_size + + loss = criterion(output,label)/accum_step + val_loss += loss.detach().to('cpu') + # loss.backward() + torch.cuda.empty_cache() + + + img,label,output ,loss= None,None,None,None + + + + torch.cuda.empty_cache() + + if abs(val_loss-temp_loss) > 0.03 : + grad_clip=clip + # print(f"val_loss {val_loss} - train_loss {temp_loss} = {abs(val_loss-temp_loss)} > 0.3") + # print(f"set grad clip to {grad_clip}") + + best_val_loss = val_loss + + val_loss = None + img,label,output = None,None,None + + + + print(f'top 1 val acc : {total_acc[0]} top 5 val acc : {total_acc[1]}') + print(f'val_size :{count}') + top_1_acc ,top_5_acc = 100*total_acc[0]/count, 100*total_acc[1]/count + print(f'top 1 val acc %: {top_1_acc}') + print(f'top 5 val acc %: {top_5_acc}') + + + print(f'top 1 train acc : {train_acc[0]} top 5 train acc : {train_acc[1]}') + print(f'train_size :{train_num}') + top_1_train ,top_5_train = 100*train_acc[0]/train_num, 100*train_acc[1]/train_num + print(f'top 1 train acc %: {top_1_train}') + print(f'top 5 train acc %: {top_5_train}') + + + scheduler.step(top_5_acc) + + + +###################################################################### +# (Optional) ImageNet +# ------------------- +# + +class Cusotm_ImageNet(ImageNet) : + def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,split=None,val=False): + + self.multi = multi + self.s_max = 512 + self.s_min= 256 + 
        if multi :
            self.S = np.random.randint(low=self.s_min,high=self.s_max)
        else :
            self.S = s_min
        transform = A.Compose(
            [
            A.Normalize(),
            A.SmallestMaxSize(max_size=self.S),
            A.RandomCrop(height =224,width=224),
            A.HorizontalFlip(),
            # A.RGBShift()
            ]

        )
        super().__init__(root,transform=transform,split=split)
        self.val =val
        self.multi = multi
    def __getitem__(self, index: int) :
        """
        Args:
            index (int): Index

        Returns:
            tuple: (image, target) where target is index of the target class.
        """
        path, target = self.samples[index]
        img = self.loader(path)
        # doing this so that it is consistent with all other datasets
        # to return a PIL Image
        img=np.array(img)
        img = Image.fromarray(img)

        if img.mode == 'L' : img = img.convert('RGB')
        img=np.array(img,dtype=np.float32)

        # print(self.transform)

        if self.transform is not None:
            img = self.transform(image=img)
            if len(img['image'].shape) == 3 and self.val==False :
                img = A.RGBShift()(image=img['image'])
            img = img['image']

        if self.target_transform is not None:
            target = self.target_transform(target)
        # print(img)
        img=img.transpose((2,0,1))

        return img, target

if DatasetName == 'ImageNet' :
    train_data= Cusotm_ImageNet(root='ImageNet',split='train')
    val_data= Cusotm_ImageNet('ImageNet',split='val',val=True)
    val_data.val= True
    val_data.s_min = test_min
    val_data.transform= A.Compose(
        [
        A.Normalize(),
        A.SmallestMaxSize(max_size=val_data.S),
        A.CenterCrop(height =224,width=224),
        # A.HorizontalFlip(),
        # A.RGBShift()
        ]

    )
diff --git a/index.rst b/index.rst
index a231be4dc2..6fded1fbea 100644
--- a/index.rst
+++ b/index.rst
@@ -151,6 +151,12 @@ Welcome to PyTorch Tutorials
    :link: advanced/usb_semisup_learn.html
    :tags: Image/Video

+.. customcarditem::
+   :header: Pretraining VGG from scratch
+   :card_description: Train VGG from scratch
+   :link: beginner/Pretraining_Vgg_from_scratch.html
+   :tags: Image/Video
+
 .. Audio

 .. customcarditem::
@@ -976,7 +982,7 @@ Additional Resources
    intermediate/spatial_transformer_tutorial
    beginner/vt_tutorial
    intermediate/tiatoolbox_tutorial
-
+   beginner/Pretraining_Vgg_from_scratch
 .. toctree::
    :maxdepth: 2
    :includehidden:

From c401bcbe7b8ae010ff8096c782e942d22d19a582 Mon Sep 17 00:00:00 2001
From: woongjoonchoi 
Date: Tue, 16 Jul 2024 13:21:34 +0900
Subject: [PATCH 03/21] modify : tutorial Follow PyTorch tutorial convention
 Modify I -> We

---
 .../Pretraining_Vgg_from_scratch.py | 33 ++++++++++++-------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py
index 1ddcd745e2..e86b9f025c 100644
--- a/beginner_source/Pretraining_Vgg_from_scratch.py
+++ b/beginner_source/Pretraining_Vgg_from_scratch.py
@@ -1,20 +1,29 @@
 """
-In this tutorial, we will embark on an exciting journey to build and
-train a VGG network from scratch using Python and popular deep learning
-libraries such as PyTorch. We will dive into the details of the VGG
-architecture, understanding its components and the rationale behind its
-design.
+Pretraining VGG from scratch
+============================
+
+
+**Author:** `WoongJoon Choi `_
+
+

-Our tutorial is designed for both beginners who are new to deep learning
-and seasoned practitioners looking to deepen their understanding of CNN
""" ###################################################################### -# Author : `WoongJoon Choi `__ -# +# In this tutorial, we will embark on an exciting journey to build and +# train a VGG network from scratch using Python and popular deep learning +# libraries such as PyTorch. We will dive into the details of the VGG +# architecture, understanding its components and the rationale behind its +# design. +# +# Our tutorial is designed for both beginners who are new to deep learning +# and seasoned practitioners looking to deepen their understanding of CNN +# architectures. + + + import torch.optim as optim import albumentations as A @@ -30,7 +39,7 @@ ###################################################################### -# I recommend using GPU for this tutorial. +# We recommend using GPU for this tutorial. # device = 'cuda' if torch.cuda.is_available() else 'cpu' @@ -126,7 +135,7 @@ ###################################################################### # | If your GPU memory is 24GB ,The maximum batch size is 128. But, if you -# use Colab , I recommend using 32 . +# use Colab , We recommend using 32 . # | You can modify the batch size according to your preference. # From 32ed4350d937649ff89df00c1600a92318564fbe Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Tue, 16 Jul 2024 14:41:34 +0900 Subject: [PATCH 04/21] modify : VGG training from scratch Add More things to try , Conclusion , Further Reading --- .../Pretraining_Vgg_from_scratch.py | 38 ++++++++++++++----- 1 file changed, 29 insertions(+), 9 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index e86b9f025c..46db2825b5 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -5,22 +5,22 @@ **Author:** `WoongJoon Choi `_ +In this tutorial, we will embark on an exciting journey to build and +train a VGG network from scratch using Python and popular deep learning +libraries such as PyTorch. We will dive into the details of the VGG +architecture, understanding its components and the rationale behind its +design. +Our tutorial is designed for both beginners who are new to deep learning +and seasoned practitioners looking to deepen their understanding of CNN +architectures. """ ###################################################################### -# In this tutorial, we will embark on an exciting journey to build and -# train a VGG network from scratch using Python and popular deep learning -# libraries such as PyTorch. We will dive into the details of the VGG -# architecture, understanding its components and the rationale behind its -# design. -# -# Our tutorial is designed for both beginners who are new to deep learning -# and seasoned practitioners looking to deepen their understanding of CNN -# architectures. + @@ -650,3 +650,23 @@ def __getitem__(self, index: int) : ] ) + +###################################################################### +# Conculsion +# ---------- +# We have seen how pretraining VGG from scratch . This Tutorial will be helpful to reproduce another Foundation Model . 
+ +###################################################################### +# More things to try +# ------------------ +# - Trying On ImageNet +# - Try All version of Model +# - Try All evaluation method in VGG paper + + +###################################################################### +# Further Reading +# --------------- + +# - `VGG training using python script `__ +# - `VGG paper `__ \ No newline at end of file From f8dbb6e343e8d71f489f3b26a5ac4d35f72d7cbf Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Wed, 24 Jul 2024 14:19:18 +0900 Subject: [PATCH 05/21] modify : pyspellchecker --- .../Pretraining_Vgg_from_scratch.py | 77 ++++++++----------- 1 file changed, 30 insertions(+), 47 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 46db2825b5..3d387a484b 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -1,5 +1,5 @@ """ -Pretraining VGG from scratch +``Pretraining`` VGG from scratch ============================ @@ -55,7 +55,7 @@ # - We train the model from scratch using only the configuration # presented in the paper. # -# - we do not use future method, like BatchNormalization,Adam , He +# - we do not use future method, like Batch normalization,Adam , He # initialization. # # - You can apply to ImageNet Data. @@ -68,7 +68,7 @@ ###################################################################### -# Why Vgg is so popluar ? +# Why VGG is so popular ? # ----------------------- # @@ -76,7 +76,7 @@ ###################################################################### # VGG became a model that attracted attention because it succeeded in # building deeper layers and dramatically shortening the training time -# compared to alexNet, which was the sota model at the time.: +# compared to alexnet, which was the SOTA model at the time.: # @@ -91,12 +91,12 @@ # this configuration will be explained below section. # -DatasetName = 'Cifar' # Cifar ,Cifar10, Mnist , ImageNet +DatasetName = 'Cifar' # CIFAR ,CIFAR10, MNIST , ImageNet ## model configuration num_classes = 100 -# CalTech 257 Cifar 100 Cifar10 10 ,Mnist 10 ImageNet 1000 +# Caltech 257 CIFAR 100 CIFAR10 10 ,MNIST 10 ImageNet 1000 model_version = None ## you must configure it. ## data configuration @@ -119,7 +119,7 @@ update_count = int(256/batch_size) accum_step = int(256/batch_size) -eval_step =26 * accum_step ## CalTech 5 Cifar 5 Mnist 6 , Cifar10 5 ImageNet 26 +eval_step =26 * accum_step ## Caltech 5 CIFAR 5 MNIST 6 , CIFAR10 5 ImageNet 26 ## model configuration @@ -147,9 +147,9 @@ ###################################################################### -# We use ``CIFAR100`` Dataset in this tutorial. In Vgg paper , the authors -# scales image istropically . Then , they apply -# Normalization,RandomCrop,HorizontalFlip . So , we need to override +# We use ``CIFAR100`` Dataset in this tutorial. In VGG paper , the authors +# scales image isotropically . Then , they apply +# Normalization,``RandomCrop``,``HorizontalFlip`` . So , we need to override # CIFAR100 class to apply preprocessing. 
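#
# To see this preprocessing in isolation (an editor's sketch; the mean and
# std values are the CIFAR-100 channel statistics already used in the
# tutorial), we can run a dummy image through the same albumentations
# pipeline and check the output shape:

dummy = np.random.rand(32, 32, 3).astype(np.float32)  # stand-in for one CIFAR image
pipeline = A.Compose(
    [
        A.Normalize(mean=(0.5071, 0.4867, 0.4408), std=(0.2675, 0.2565, 0.2761)),
        A.SmallestMaxSize(max_size=256),  # isotropic rescale: shorter side -> 256
        A.RandomCrop(height=224, width=224),
        A.HorizontalFlip(),
    ]
)
print(pipeline(image=dummy)["image"].shape)  # (224, 224, 3)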
# @@ -168,8 +168,7 @@ def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,downloa A.Normalize(mean =(0.5071, 0.4867, 0.4408) , std = (0.2675, 0.2565, 0.2761)), A.SmallestMaxSize(max_size=self.S), A.RandomCrop(height =224,width=224), - A.HorizontalFlip(), - # A.RGBShift() + A.HorizontalFlip() ] ) @@ -216,12 +215,12 @@ def __getitem__(self, index: int) : ###################################################################### -# | In Vgg paper, they do experiment over 6 models. model A is 11 layers, -# model B is 13 layers, model C is 16 layers , model D is 16 laeyrs and +# | In VGG paper, they do experiment over 6 models. model A is 11 layers, +# model B is 13 layers, model C is 16 layers , model D is 16 layers and # model D is 19 layers . you can train all version of models to # reproduce VGG . # | ``Config_Channels`` means output channels and ``Config_kernels`` means -# kerenl size. +# kernel size. # import torch @@ -284,8 +283,7 @@ def __init__(self,version , num_classes): self.num_classes = num_classes self.linear_out = 4096 self.xavier_count = xavier_count - self.last_xavier= last_xavier ## if >0 , initialize last 3 fully connected noraml distribution - # conv_1_by_1_3_outchannel = num_classes + self.last_xavier= last_xavier ## if >0 , initialize last 3 fully connected normal distribution self.except_xavier = except_xavier super().__init__() @@ -307,8 +305,6 @@ def __init__(self,version , num_classes): print('weight intialize end') def forward(self,x): x = self.feature_extractor(x) - # x= self.avgpool(x) ## If Linear is output, use this - # x= torch.flatten(x,start_dim = 1) ## If Linear is output, use this x = self.output_layer(x) x= self.avgpool(x) x= torch.flatten(x,start_dim = 1) @@ -318,15 +314,12 @@ def forward(self,x): @torch.no_grad() def _init_weights(self,m): - # print(m) if isinstance(m,nn.Conv2d): print('-------------') print(m.kernel_size) print(m.out_channels) - # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) and self.last_xavier>0 : if self.last_xavier>0 and (self.except_xavier is None or self.last_xavier!=self.except_xavier): print('xavier') - # self.last_xavier-=1 nn.init.xavier_uniform_(m.weight) elif self.xavier_count >0 : print('xavier') @@ -335,10 +328,8 @@ def _init_weights(self,m): else : std = 0.1 print(f'normal std : {std}') - torch.nn.init.normal_(m.weight,std=std) - # if (m.out_channels == self.num_classes or m.out_channels == self.linear_out) : - # self.last_xavier+=10 + self.last_xavier +=1 if m.bias is not None : print('bias zero init') @@ -361,21 +352,21 @@ def _init_weights(self,m): ###################################################################### -# When training Vgg , the authors first train model A , then initialized +# When training VGG , the authors first train model A , then initialized # the weights of other models with the weights of model A. Waiting for # Model A to be trained takes a long time . The authors mention how to -# train with xavier initialization rather than initializing with the +# train with ``xavier`` initialization rather than initializing with the # weights of model A. But, they do not mention how to initialize . # -# | To Reproduce Vgg , we use xavier initialization method to initialize -# weights. We apply initialization to few first layes and last layers. +# | To Reproduce VGG , we use ``xavier`` initialization method to initialize +# weights. We apply initialization to few first layers and last layers. # Then , we apply random initialization to other layers. 
-# | **we must fix stdandrad deviation to 0.1**. If standard deviation is +# | **we must fix standard deviation to 0.1**. If standard deviation is # larger than 0.1, the weight get NAN values. For stability, we use 0.1 # for standard deviation. -# | The ``front_xavier`` means how many layers we initialize with xavier +# | The ``front_xavier`` means how many layers we initialize with ``xavier`` # initialization in front of layers and The ``last_xavier`` means how -# many layers we initializae with xavier initialization in last of +# many layers we initialize with ``xavier`` initialization in last of # layers. # # In My experiment, we can use ``front_xavier`` = 4 , ``last_xavier``\ =5 @@ -406,17 +397,15 @@ def accuracy(output, target, topk=(1,)): res = [] for k in topk: - # print(f'top {k}') correct_k = correct[:k].reshape(-1).float().sum(0,keepdim=True) - # res.append(correct_k.mul_(100.0 / batch_size)) res.append(correct_k) return res ###################################################################### # we initiate model and loss function and optimizer and schedulers. In -# vgg, they use softmax output ,Momentum Optimizer , and Scheduling based -# on accuarcy. +# VGG, they use softmax output ,Momentum Optimizer , and Scheduling based +# on accuracy. # model = Model_vgg(model_version,num_classes) @@ -440,9 +429,7 @@ def accuracy(output, target, topk=(1,)): [ A.Normalize(mean =(0.5071, 0.4867, 0.4408) , std = (0.2675, 0.2565, 0.2761)), A.SmallestMaxSize(max_size=val_data.S), - A.CenterCrop(height =224,width=224), - # A.HorizontalFlip(), - # A.RGBShift() + A.CenterCrop(height =224,width=224) ] ) @@ -492,7 +479,6 @@ def accuracy(output, target, topk=(1,)): if i> 0 and i%update_count == 0 : print(f'Training steps : {i} parameter update loss :{total_loss} ') if grad_clip is not None: - # print(f'Training steps : {i} parameter grad clip to {grad_clip}') torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip) optimizer.step() optimizer.zero_grad(set_to_none=True) @@ -594,8 +580,7 @@ def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,split=N A.Normalize(), A.SmallestMaxSize(max_size=self.S), A.RandomCrop(height =224,width=224), - A.HorizontalFlip(), - # A.RGBShift() + A.HorizontalFlip() ] ) @@ -644,9 +629,7 @@ def __getitem__(self, index: int) : [ A.Normalize(), A.SmallestMaxSize(max_size=val_data.S), - A.CenterCrop(height =224,width=224), - # A.HorizontalFlip(), - # A.RGBShift() + A.CenterCrop(height =224,width=224) ] ) @@ -654,7 +637,7 @@ def __getitem__(self, index: int) : ###################################################################### # Conculsion # ---------- -# We have seen how pretraining VGG from scratch . This Tutorial will be helpful to reproduce another Foundation Model . +# We have seen how ``pretraining`` VGG from scratch . This Tutorial will be helpful to reproduce another Foundation Model . 
###################################################################### # More things to try @@ -668,5 +651,5 @@ def __getitem__(self, index: int) : # Further Reading # --------------- -# - `VGG training using python script `__ +# - `VGG training using python script `__ # - `VGG paper `__ \ No newline at end of file From b2cd7bf189fa72a43caa546937d57164adaaf031 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Wed, 24 Jul 2024 14:34:39 +0900 Subject: [PATCH 06/21] modify : Pretraining VGG from scrach add packagge install code --- beginner_source/Pretraining_Vgg_from_scratch.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 3d387a484b..c38183ff01 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -15,11 +15,12 @@ and seasoned practitioners looking to deepen their understanding of CNN architectures. +Before you start -""" - +.. code-block:: sh -###################################################################### + pip install albumentations +""" From af08545747004a5d72a437c4a5c32db049656796 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Wed, 24 Jul 2024 14:36:08 +0900 Subject: [PATCH 07/21] modify : Pretraining VGG from scrach add packagge install code --- beginner_source/Pretraining_Vgg_from_scratch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index c38183ff01..7edaf6d889 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -56,7 +56,7 @@ # - We train the model from scratch using only the configuration # presented in the paper. # -# - we do not use future method, like Batch normalization,Adam , He +# - we do not use future method, like ``Batch normalization``,Adam , He # initialization. # # - You can apply to ImageNet Data. @@ -150,7 +150,7 @@ ###################################################################### # We use ``CIFAR100`` Dataset in this tutorial. In VGG paper , the authors # scales image isotropically . Then , they apply -# Normalization,``RandomCrop``,``HorizontalFlip`` . So , we need to override +# ``Normalization``,``RandomCrop``,``HorizontalFlip`` . So , we need to override # CIFAR100 class to apply preprocessing. # From 95b16d43ba3a6db015fefc8383c82e7e50499cf3 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Wed, 24 Jul 2024 14:50:59 +0900 Subject: [PATCH 08/21] modify : Pretraining VGG from scratch remove tqdm --- beginner_source/Pretraining_Vgg_from_scratch.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 7edaf6d889..9f4e6d8cb0 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -29,7 +29,7 @@ import torch.optim as optim import albumentations as A import numpy as np - +import torch from torchvision.datasets import CIFAR100,CIFAR10,MNIST,ImageNet import os @@ -409,10 +409,11 @@ def accuracy(output, target, topk=(1,)): # on accuracy. 
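#
# (editor's note) ``ReduceLROnPlateau`` in ``'max'`` mode watches a metric
# that should increase, here the top-5 validation accuracy passed to
# ``scheduler.step(top_5_acc)`` at the end of each epoch. Once the metric
# stops improving for ``patience`` epochs, the learning rate is multiplied
# by the scheduler's ``factor`` (0.1 by default), mirroring the
# accuracy-based learning rate drops used in the paper.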
# +model_version='B' model = Model_vgg(model_version,num_classes) criterion = nn.CrossEntropyLoss() -optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay,nesterov=nestrov,momentum=momentum) +optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay,momentum=momentum) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',patience=10,threshold=1e-3,eps = 1e-5) @@ -447,7 +448,7 @@ def accuracy(output, target, topk=(1,)): grad_clip = 1.0 -for e in range(epoch-resume_epoch) : +for e in range(epoch) : print(f'Training Epoch : {e}') total_loss = 0 val_iter = iter(val_loader) @@ -456,7 +457,7 @@ def accuracy(output, target, topk=(1,)): total_acc = [0,0] count= 0 - for i , data in tqdm(enumerate(train_loader)) : + for i , data in enumerate(train_loader) : model.train() @@ -501,7 +502,7 @@ def accuracy(output, target, topk=(1,)): val_loss = 0 torch.cuda.empty_cache() - for j in tqdm(range(update_count)) : + for j in range(update_count) : loss = None print(f'Evaluation Steps Start') try : From 2c891c96f1701042566d689816a262ef1fa913e6 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Sat, 10 Aug 2024 08:10:55 +0900 Subject: [PATCH 09/21] modify : VGG tutorial pyspell check,add albumentations installation code --- .../Pretraining_Vgg_from_scratch.py | 20 +++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 9f4e6d8cb0..8b37e3b9ae 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -21,8 +21,16 @@ pip install albumentations """ +import subprocess +import sys - +try: + import albumentations + print("albumentations are already installed") +except ImportError: + print("albumentations module not found. Installing...") + subprocess.check_call([sys.executable, "-m", "pip", "install", "albumentations"]) + print("albumentations module installed successfully.") @@ -77,7 +85,7 @@ ###################################################################### # VGG became a model that attracted attention because it succeeded in # building deeper layers and dramatically shortening the training time -# compared to alexnet, which was the SOTA model at the time.: +# compared to ``alexnet``, which was the SOTA model at the time.: # @@ -97,7 +105,7 @@ ## model configuration num_classes = 100 -# Caltech 257 CIFAR 100 CIFAR10 10 ,MNIST 10 ImageNet 1000 +# ``Caltech`` 257 CIFAR 100 CIFAR10 10 ,MNIST 10 ImageNet 1000 model_version = None ## you must configure it. ## data configuration @@ -120,7 +128,7 @@ update_count = int(256/batch_size) accum_step = int(256/batch_size) -eval_step =26 * accum_step ## Caltech 5 CIFAR 5 MNIST 6 , CIFAR10 5 ImageNet 26 +eval_step =26 * accum_step ## ``Caltech`` 5 CIFAR 5 MNIST 6 , CIFAR10 5 ImageNet 26 ## model configuration @@ -149,7 +157,7 @@ ###################################################################### # We use ``CIFAR100`` Dataset in this tutorial. In VGG paper , the authors -# scales image isotropically . Then , they apply +# scales image ``isotropically`` . Then , they apply # ``Normalization``,``RandomCrop``,``HorizontalFlip`` . So , we need to override # CIFAR100 class to apply preprocessing. # @@ -637,7 +645,7 @@ def __getitem__(self, index: int) : ) ###################################################################### -# Conculsion +# Conclusion # ---------- # We have seen how ``pretraining`` VGG from scratch . 
This Tutorial will be helpful to reproduce another Foundation Model . From f706a64e78eede37246559b3ad521c23d7807408 Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Mon, 16 Sep 2024 14:03:04 +0900 Subject: [PATCH 10/21] modify : VGG tutorial Lint and Technical Details and flexible code style --- .ci/docker/requirements.txt | 1 + .../Pretraining_Vgg_from_scratch.py | 129 ++++++++++-------- 2 files changed, 74 insertions(+), 56 deletions(-) diff --git a/.ci/docker/requirements.txt b/.ci/docker/requirements.txt index 00cf2f2103..b1783d2170 100644 --- a/.ci/docker/requirements.txt +++ b/.ci/docker/requirements.txt @@ -1,6 +1,7 @@ # --extra-index-url https://download.pytorch.org/whl/cu117/index.html # Use this to run/publish tutorials against the latest binaries during the RC stage. Comment out after the release. Each release verify the correct cuda version. # Refer to ./jenkins/build.sh for tutorial build instructions +albumentations sphinx==5.0.0 sphinx-gallery==0.11.1 sphinx_design diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 8b37e3b9ae..ae066cd442 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -5,9 +5,9 @@ **Author:** `WoongJoon Choi `_ -In this tutorial, we will embark on an exciting journey to build and -train a VGG network from scratch using Python and popular deep learning -libraries such as PyTorch. We will dive into the details of the VGG +VGG (Visual Geometry Group) is a convolutional neural network architecture that is particularly +efficient in image classification tasks. In this tutorial, we will guide you through building +and training a VGG network from scratch using Python and PyTorch. We will dive into the details of the VGG architecture, understanding its components and the rationale behind its design. @@ -15,11 +15,23 @@ and seasoned practitioners looking to deepen their understanding of CNN architectures. -Before you start +.. grid:: 2 + + .. grid-item-card:: :octicon:`mortar-board;1em;` What you will learn + :class-card: card-prerequisites + + * Understand the VGG architecture and train it from scratch using PyTorch. + * Use PyTorch tools to evaluate the VGG model's performance + + .. grid-item-card:: :octicon:`list-unordered;1em;` Prerequisites + :class-card: card-prerequisites + + * Complete the `Learn the Basics tutorials `__ + * Familiarity with basic machine learning concepts and terms + +If you are running this in Google Colab, install albumentations -.. code-block:: sh - pip install albumentations """ import subprocess import sys @@ -55,7 +67,7 @@ ###################################################################### -# Worth point of this tutorial +# Purpose point of this tutorial # ---------------------------- # @@ -77,7 +89,7 @@ ###################################################################### -# Why VGG is so popular ? +# Background # ----------------------- # @@ -85,9 +97,15 @@ ###################################################################### # VGG became a model that attracted attention because it succeeded in # building deeper layers and dramatically shortening the training time -# compared to ``alexnet``, which was the SOTA model at the time.: +# compared to ``AlexNet``, which was the SOTA model at the time. # - +# Unlike ``AlexNet``'s 5x5 9x9 filters, VGG only uses 3x3 filters. 
+# Using multiple 3x3 filters can obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number of parameters. +# In addition, since it passes through multiple nonlinear functions, the nonlinearity increases even more. +# +# VGG applied a max pooling layer after multiple convolutional layers to reduce the spatial size. +# This allowed the feature map to be downsampled while preserving important information. +# Thanks to this, the network could learn high-dimensional features in deeper layers and prevent overfitting. ###################################################################### # VGG Configuration @@ -96,11 +114,11 @@ ###################################################################### -# We define some configurations suggested in VGG paper . Details about +# We define some configurations suggested in VGG paper. Details of # this configuration will be explained below section. # -DatasetName = 'Cifar' # CIFAR ,CIFAR10, MNIST , ImageNet +DatasetName = 'CIFAR' # CIFAR, CIFAR10, MNIST, ImageNet ## model configuration @@ -143,8 +161,8 @@ ###################################################################### -# | If your GPU memory is 24GB ,The maximum batch size is 128. But, if you -# use Colab , We recommend using 32 . +# | If your GPU memory is 24GB, the maximum batch size is 128. But if you +# use Colab, we recommend using 32GB. # | You can modify the batch size according to your preference. # @@ -156,10 +174,15 @@ ###################################################################### -# We use ``CIFAR100`` Dataset in this tutorial. In VGG paper , the authors -# scales image ``isotropically`` . Then , they apply -# ``Normalization``,``RandomCrop``,``HorizontalFlip`` . So , we need to override -# CIFAR100 class to apply preprocessing. +# We use the ``CIFAR100`` dataset in this tutorial. In VGG paper, the authors scales image ``isotropically`` . +# ``Isotropiclay scale up`` is a method of increasing the size of an image while maintaining its proportions, preventing distortion and maintaining the consistency of the object. +# Then they apply ``Normalization``,``RandomCrop``,``HorizontalFlip`` . +# Normalizing input data to a range of 0 to 1 tends to lead to faster convergence of the model. This is because the weight updates are more uniform. In particular, neural network models can have significantly different weight updates depending on the range of input values. +# Neural network models generally work best when the input data is within a certain range (e.g. 0 to 1). If RGB values ​​are not normalized, the model is fed input values ​​of different ranges, which makes it difficult for the model to process the data in a consistent manner. Normalization allows all data to be scaled to the same scale, which allows the model to treat each feature more evenly, improving performance. +# If the training and test data have different ranges, the model may have difficulty generalizing. Therefore, it is important to fit the values ​​to the same range across all data. This allows the model to perform well on both test data and real data. +# Using normalized images as input allows the neural network to learn more effectively and show stable performance. +# Data augmentation, such as ``RandomCrop`` and ``HorizontalFlip``, is a very useful technique for improving the performance of deep learning models, preventing overfitting, and helping models to work robustly in various environments. 
In particular, when the dataset is small or limited, data augmentation can secure more data, and the model can show better generalization performance by learning various transformed data. +# So we need to override CIFAR100 class to apply preprocessing. # class Custom_Cifar(CIFAR100) : @@ -224,10 +247,8 @@ def __getitem__(self, index: int) : ###################################################################### -# | In VGG paper, they do experiment over 6 models. model A is 11 layers, -# model B is 13 layers, model C is 16 layers , model D is 16 layers and -# model D is 19 layers . you can train all version of models to -# reproduce VGG . +# | The VGG paper experiments over 6 models of varying layer depth. The various configurations +# | are enumerated below for full reproduction of the results. # | ``Config_Channels`` means output channels and ``Config_kernels`` means # kernel size. # @@ -236,26 +257,26 @@ def __getitem__(self, index: int) : from torch import nn -Config_channels = { -"A" : [64,"M" , 128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , -"A_lrn" : [64,"LRN","M" , 128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , -"B" :[64,64,"M" , 128,128, "M" , 256,256,"M" ,512,512 ,"M" , 512,512,"M"] , -"C" : [64,64,"M" , 128,128, "M" , 256,256,256,"M" ,512,512 ,512,"M" , 512,512,512,"M"] , -"D" :[64,64,"M" , 128,128, "M" , 256,256,256,"M" ,512,512 ,512,"M" , 512,512,512,"M"] , -"E" :[64,64,"M" , 128,128, "M" , 256,256,256,256,"M" ,512,512 ,512,512,"M" , 512,512,512,512,"M"] , +# Config_channels -> number : output_channels , "M": max_pooling layer +Config_channels = { +"A":[64,"M",128,"M",256,256,"M",512,512,"M",512,512,"M"], +"A_lrn":[64,"LRN","M",128,"M",256,256,"M",512,512,"M",512,512,"M"], +"B":[64,64,"M",128,128,"M",256,256,"M",512,512,"M",512,512,"M"], +"C":[64,64,"M",128,128,"M",256,256,256,"M",512,512,512,"M",512,512,512,"M"], +"D":[64,64,"M",128,128,"M",256,256,256,"M",512,512,512,"M",512,512,512,"M"], +"E":[64,64,"M",128,128,"M",256,256,256,256,"M",512,512,512,512,"M",512,512,512,512,"M"], } - +# Config_kernel -> kernel_size Config_kernel = { -"A" : [3,2 , 3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , -"A_lrn" : [3,2,2 , 3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , -"B" :[3,3,2 , 3,3, 2 , 3,3,2 ,3,3 ,2 , 3,3,2] , -"C" : [3,3,2 , 3,3, 2 , 3,3,1,2 ,3,3 ,1,2 , 3,3,1,2] , -"D" :[3,3,2 , 3,3, 2 , 3,3,3,2 ,3,3 ,3,2 , 3,3,3,2] , -"E" :[3,3,2 , 3,3, 2 , 3,3,3,3,2 ,3,3 ,3,3,2 , 3,3,3,3,2] , - +"A":[3,2,3,2,3,3,2,3,3,2,3,3,2], +"A_lrn":[3,2,2,3,2,3,3,2,3,3,2,3,3,2], +"B":[3,3,2,3,3,2,3,3,2,3,3,2,3,3,2], +"C":[3,3,2,3,3,2,3,3,1,2,3,3,1,2,3,3,1,2], +"D":[3,3,2,3,3,2,3,3,3,2,3,3,3,2,3,3,3,2], +"E":[3,3,2,3,3,2,3,3,3,3,2,3,3,3,3,2,3,3,3,3,2], } @@ -284,7 +305,8 @@ def make_feature_extractor(cfg_c,cfg_k): class Model_vgg(nn.Module) : - def __init__(self,version , num_classes): + # def __init__(self,version , num_classes): + def __init__(self,conf_channels,conf_kernels , num_classes): conv_5_out_w ,conv_5_out_h = 7,7 conv_5_out_dim =512 conv_1_by_1_1_outchannel = 4096 @@ -296,7 +318,7 @@ def __init__(self,version , num_classes): self.except_xavier = except_xavier super().__init__() - self.feature_extractor = make_feature_extractor(Config_channels[version] , Config_kernel[version]) + self.feature_extractor = make_feature_extractor(conf_channels, conf_kernels) self.avgpool = nn.AdaptiveAvgPool2d((1,1)) self.output_layer = nn.Sequential( nn.Conv2d(conv_5_out_dim ,conv_1_by_1_1_outchannel ,7) , @@ -354,17 +376,12 @@ def _init_weights(self,m): nn.init.constant_(m.bias, 0) 
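
# The two initialization branches above, shown in isolation on a throwaway
# 3x3 convolution (an editor's sketch, not part of the original tutorial):
#
#     conv = nn.Conv2d(3, 64, kernel_size=3)
#     nn.init.xavier_uniform_(conv.weight)   # Glorot: scale derived from fan-in/fan-out
#     nn.init.normal_(conv.weight, std=0.1)  # plain Gaussian with the fixed std of 0.1
#     nn.init.zeros_(conv.bias)              # biases always start at zero
#
# One caveat worth noting: in the ``nn.Linear`` branch of ``_init_weights``,
# ``std`` is referenced without ever being assigned, which would raise a
# ``NameError``. The models built here never take that path (the classifier
# uses 1x1 convolutions rather than ``nn.Linear``), but a fixed value such
# as ``std = 0.1`` would have to be defined there before that branch could run.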
-###################################################################### -# Parameter Initialization -# ~~~~~~~~~~~~~~~~~~~~~~~~ -# - ###################################################################### -# When training VGG , the authors first train model A , then initialized -# the weights of other models with the weights of model A. Waiting for +# When training VGG, the authors first train model A, then continue training from +# the resultant weights for other variants. Waiting for # Model A to be trained takes a long time . The authors mention how to -# train with ``xavier`` initialization rather than initializing with the +# train with ``Xavier`` initialization rather than initializing with the # weights of model A. But, they do not mention how to initialize . # # | To Reproduce VGG , we use ``xavier`` initialization method to initialize @@ -418,7 +435,7 @@ def accuracy(output, target, topk=(1,)): # model_version='B' -model = Model_vgg(model_version,num_classes) +model = Model_vgg(Config_channels[model_version],Config_kernel[model_version],num_classes) criterion = nn.CrossEntropyLoss() optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay,momentum=momentum) @@ -575,7 +592,7 @@ def accuracy(output, target, topk=(1,)): # ------------------- # -class Cusotm_ImageNet(ImageNet) : +class Custom_ImageNet(ImageNet) : def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,split=None,val=False): self.multi = multi @@ -615,7 +632,6 @@ def __getitem__(self, index: int) : if img.mode == 'L' : img = img.convert('RGB') img=np.array(img,dtype=np.float32) - # print(self.transform) if self.transform is not None: img = self.transform(image=img) @@ -631,8 +647,8 @@ def __getitem__(self, index: int) : return img, target if DatasetName == 'ImageNet' : - train_data= Cusotm_ImageNet(root='ImageNet',split='train') - val_data= Cusotm_ImageNet('ImageNet',split='val',val=True) + train_data= Custom_ImageNet(root='ImageNet',split='train') + val_data= Custom_ImageNet('ImageNet',split='val',val=True) val_data.val= True val_data.s_min = test_min val_data.transform= A.Compose( @@ -647,14 +663,15 @@ def __getitem__(self, index: int) : ###################################################################### # Conclusion # ---------- -# We have seen how ``pretraining`` VGG from scratch . This Tutorial will be helpful to reproduce another Foundation Model . +# We have seen how ``pretraining`` VGG from scratch . +# This Tutorial will be helpful to reproduce another Foundation Model . 
###################################################################### # More things to try # ------------------ -# - Trying On ImageNet -# - Try All version of Model -# - Try All evaluation method in VGG paper +# - Apply model to ImageNet +# - Try all model variants +# - Try additional evaluation method ###################################################################### From 993d32ae49deec80ededac379c26d49596feb2df Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 16 Sep 2024 11:22:07 -0700 Subject: [PATCH 11/21] Update Pretraining_Vgg_from_scratch.py Editorial and formatting fixes --- .../Pretraining_Vgg_from_scratch.py | 293 ++++++++---------- 1 file changed, 135 insertions(+), 158 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index ae066cd442..14e6bf4f42 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -27,10 +27,41 @@ :class-card: card-prerequisites * Complete the `Learn the Basics tutorials `__ - * Familiarity with basic machine learning concepts and terms + * PyTorch 2.4 or later + * We recommend to run this tutorial on GPU + +Overview +------------ -If you are running this in Google Colab, install albumentations +​​VGG is a model that attracted attention due to its ability to build deeper layers and dramatically +shorten the training time compared to AlexNet, which was the state-of-the-art model at the time of the publishing +of the `original paper `__. +Unlike AlexNet's 5x5 and 9x9 filters, VGG uses only 3x3 filters. Using multiple 3x3 filters can +obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number +of parameters. In addition, since it passes through multiple nonlinear functions, the +nonlinearity increases even more. + +VGG applies a max pooling layer after multiple convolutional layers to reduce the spatial size. +This allows the feature map to be downsampled while preserving important information. Thanks +to this, the network can learn high-dimensional features in deeper layers and prevent overfitting. + +In this tutorial, we will train the VGG model from scratch using only the configuration presented +in the original VGG paper. We will not use future methods such as batch normalization, Adam optimization, or +He initialization. The trained model can be applied to ImageNet data, and you can learn +VGG within the training time suggested in the paper. + +Setup +-------- + +.. note:: if you are running this in Google Colab, install ``albumentations`` by running: + + .. code-block:: python + + !pip3 install albumentations`` + + +First, let's import the required dependencies: """ import subprocess @@ -55,68 +86,17 @@ import os from PIL import Image - - - - -###################################################################### -# We recommend using GPU for this tutorial. -# - device = 'cuda' if torch.cuda.is_available() else 'cpu' -###################################################################### -# Purpose point of this tutorial -# ---------------------------- -# - - -###################################################################### -# - We train the model from scratch using only the configuration -# presented in the paper. -# -# - we do not use future method, like ``Batch normalization``,Adam , He -# initialization. -# -# - You can apply to ImageNet Data. -# -# - If you can download the ImageNet Data(140GB), you can apply this -# tutorial to reproduce Original VGG. 
-# -# - You can learn VGG within the training time suggested in the paper. -# - - -###################################################################### -# Background -# ----------------------- -# - - -###################################################################### -# VGG became a model that attracted attention because it succeeded in -# building deeper layers and dramatically shortening the training time -# compared to ``AlexNet``, which was the SOTA model at the time. -# -# Unlike ``AlexNet``'s 5x5 9x9 filters, VGG only uses 3x3 filters. -# Using multiple 3x3 filters can obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number of parameters. -# In addition, since it passes through multiple nonlinear functions, the nonlinearity increases even more. -# -# VGG applied a max pooling layer after multiple convolutional layers to reduce the spatial size. -# This allowed the feature map to be downsampled while preserving important information. -# Thanks to this, the network could learn high-dimensional features in deeper layers and prevent overfitting. - ###################################################################### # VGG Configuration # ----------------- -# - - -###################################################################### -# We define some configurations suggested in VGG paper. Details of -# this configuration will be explained below section. -# +# +# In this section, we will define configurations suggested in the VGG paper. +# We use the CIFAR100 dataset. The authors of the VGG paper scale images isotropically, +# which means increasing the size of an image while maintaining its proportions, +# preventing distortion and maintaining the consistency of the object. DatasetName = 'CIFAR' # CIFAR, CIFAR10, MNIST, ImageNet @@ -143,12 +123,10 @@ epoch = 10 clip= None # model D grad clip 0.7 - update_count = int(256/batch_size) accum_step = int(256/batch_size) eval_step =26 * accum_step ## ``Caltech`` 5 CIFAR 5 MNIST 6 , CIFAR10 5 ImageNet 26 - ## model configuration xavier_count= 4 @@ -159,30 +137,39 @@ model_layers =None - ###################################################################### -# | If your GPU memory is 24GB, the maximum batch size is 128. But if you -# use Colab, we recommend using 32GB. -# | You can modify the batch size according to your preference. +# .. note:: In the code above, we have defined the batch size as 32, +# which is recommended for Google Colab. However, if you are +# running this code on a machine with 24GB of GPU memory, +# you can set the batch size to 128. You can modify the batch +# size according to your preference and hardware capabilities. # - ###################################################################### -# Define dataset -# -------------- +# Defining the dataset +# -------------------- # - - -###################################################################### -# We use the ``CIFAR100`` dataset in this tutorial. In VGG paper, the authors scales image ``isotropically`` . -# ``Isotropiclay scale up`` is a method of increasing the size of an image while maintaining its proportions, preventing distortion and maintaining the consistency of the object. -# Then they apply ``Normalization``,``RandomCrop``,``HorizontalFlip`` . -# Normalizing input data to a range of 0 to 1 tends to lead to faster convergence of the model. This is because the weight updates are more uniform. 
In particular, neural network models can have significantly different weight updates depending on the range of input values. -# Neural network models generally work best when the input data is within a certain range (e.g. 0 to 1). If RGB values ​​are not normalized, the model is fed input values ​​of different ranges, which makes it difficult for the model to process the data in a consistent manner. Normalization allows all data to be scaled to the same scale, which allows the model to treat each feature more evenly, improving performance. -# If the training and test data have different ranges, the model may have difficulty generalizing. Therefore, it is important to fit the values ​​to the same range across all data. This allows the model to perform well on both test data and real data. -# Using normalized images as input allows the neural network to learn more effectively and show stable performance. -# Data augmentation, such as ``RandomCrop`` and ``HorizontalFlip``, is a very useful technique for improving the performance of deep learning models, preventing overfitting, and helping models to work robustly in various environments. In particular, when the dataset is small or limited, data augmentation can secure more data, and the model can show better generalization performance by learning various transformed data. -# So we need to override CIFAR100 class to apply preprocessing. +# As mentioned above we use the CIFAR100 dataset in this tutorial. According to the VGG paper, +# the authors scale the images isotropically to maintain their proportions. This method, known +# as isotropic scaling, increases the size of an image while preserving its aspect ratio, +# thus avoiding distortion and maintaining object consistency. +# +# After scaling the images, several preprocessing techniques are applied including normalization, +# random crop, and horizontal flip. Normalization adjusts the input data to a range of 0 to 1, +# which typically leads to faster convergence during model training. It ensures that all features +# are scaled to the same range, allowing the model to process each feature more evenly and +# improve overall performance. It is crucial to normalize both training and test data to the +# same range to ensure the model generalizes well to new, unseen data. +# +# Data augmentation techniques like random crop and horizontal flip are crucial for enhancing +# the performance of deep learning models. They help prevent overfitting and ensure that the +# model performs robustly under various conditions. Particularly in scenarios where the dataset +# is small or limited, these techniques effectively increase the amount of training data. +# By exposing the model to various transformations of the data, it learns to generalize better, +# thus improving its performance on both test data and in real-world applications. +# +# To apply preprocessing, we need to override the CIFAR100 class that we have imported from the +# ``torchvision.datasets`` with a custom class: # class Custom_Cifar(CIFAR100) : @@ -234,24 +221,20 @@ def __getitem__(self, index: int) : if self.target_transform is not None: target = self.target_transform(target) - img=img.transpose((2,0,1)) - return img, target - ###################################################################### # Define Model # ------------ # - - -###################################################################### -# | The VGG paper experiments over 6 models of varying layer depth. 
The various configurations -# | are enumerated below for full reproduction of the results. -# | ``Config_Channels`` means output channels and ``Config_kernels`` means -# kernel size. -# +# The VGG paper explores six different model configurations, each with varying layer depths. +# To fully reproduce the results, we will define these configurations below. +# +# We will use two main components to define the model: +# +# * ``Config_channels``: This refers to the number of output channels for each layer. +# * ``Config_kernels``: This refers to the kernel size (or filter size) for each layer. import torch from torch import nn @@ -281,7 +264,7 @@ def __getitem__(self, index: int) : ###################################################################### -# We define model class that generate model in choice of 6 versions. +# Next, we define a model class that generates a model with a choice of six versions. # def make_feature_extractor(cfg_c,cfg_k): @@ -378,38 +361,46 @@ def _init_weights(self,m): ###################################################################### -# When training VGG, the authors first train model A, then continue training from -# the resultant weights for other variants. Waiting for -# Model A to be trained takes a long time . The authors mention how to -# train with ``Xavier`` initialization rather than initializing with the -# weights of model A. But, they do not mention how to initialize . +# Initializing Model Weights +# ---------------------------- +# +# ggIn the original VGG paper, the authors trained model A first and then +# used its weights as a starting point for training other variants. However, +# this approach can be time-consuming. The authors also mentioned using Xavier +# initialization as an alternative to initializing with model A's weights, +# but they did not provide specific details on how to implement it. # -# | To Reproduce VGG , we use ``xavier`` initialization method to initialize -# weights. We apply initialization to few first layers and last layers. -# Then , we apply random initialization to other layers. -# | **we must fix standard deviation to 0.1**. If standard deviation is -# larger than 0.1, the weight get NAN values. For stability, we use 0.1 -# for standard deviation. -# | The ``front_xavier`` means how many layers we initialize with ``xavier`` -# initialization in front of layers and The ``last_xavier`` means how -# many layers we initialize with ``xavier`` initialization in last of -# layers. +# To reproduce the VGG results, we will use the Xavier initialization method +# to initialize the model weights. Specifically, we will apply Xavier +# initialization to the first few layers and the last few layers, while using +# random initialization for the remaining layers. + +# .. note:: +# To ensure stability, we must set the standard deviation of the initialization +# to 0.1. Using a larger standard deviation can result in NaN (Not a Number) +# values in the weights. +# +# We introduce two hyperparameters to control the Xavier initialization: + +# * ``front_xavier:`` The number of layers at the beginning of the network that are +# initialized using Xavier initialization. +# +# * ``last_xavier:`` The number of layers at the end of the network that are initialized +# using Xavier initialization. # -# In My experiment, we can use ``front_xavier`` = 4 , ``last_xavier``\ =5 -# in model A, ``front_xavier`` =4 ``last_xavier``\ =7 in model B,C , D and -# ``front_xavier`` =5\ ``last_xavier``\ = 9 in model E . These values work -# fine. 
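######################################################################
# The snippet below is an editorial sketch, not the tutorial's actual
# ``_init_weights`` method: it shows one way a counter-based scheme driven by
# ``front_xavier``-style and ``last_xavier``-style knobs could be written.
# The helper name and the layer-selection rule are assumptions for
# illustration; the fixed standard deviation of 0.1 follows the text above.
#

import torch.nn as nn

def sketch_selective_init(model, front_xavier=4, last_xavier=5, std=0.1):
    # Collect the learnable weight layers in forward order.
    layers = [m for m in model.modules() if isinstance(m, (nn.Conv2d, nn.Linear))]
    n = len(layers)
    for i, m in enumerate(layers):
        if i < front_xavier or i >= n - last_xavier:
            # Xavier initialization for the first and last few layers.
            nn.init.xavier_uniform_(m.weight)
        else:
            # Random normal initialization elsewhere; std stays at 0.1,
            # since larger values can produce NaN weights during training.
            nn.init.normal_(m.weight, mean=0.0, std=std)
        if m.bias is not None:
            nn.init.zeros_(m.bias)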
+# Based on our experiments, we recommend the following settings: +# +# * For model A: ``front_xavier`` = 4, ``last_xavier`` = 5 +# * For models B, C, and D: ``front_xavier`` = 4, ``last_xavier`` = 7 +# * For model E: ``front_xavier`` = 5, ``last_xavier`` = 9 # - +# These values have been found to work well in practice. ###################################################################### -# Training Model -# -------------- +# Training the Model +# ------------------ # - - -###################################################################### -# We will define top-k error. +# First, let's define top-k error. # def accuracy(output, target, topk=(1,)): @@ -429,9 +420,8 @@ def accuracy(output, target, topk=(1,)): ###################################################################### -# we initiate model and loss function and optimizer and schedulers. In -# VGG, they use softmax output ,Momentum Optimizer , and Scheduling based -# on accuracy. +# Next, we initiate the model and loss function, optimizer and schedulers. In the VGG model, +# they use a softmax output, Momentum Optimizer, and scheduling based on accuracy. # model_version='B' @@ -441,37 +431,27 @@ def accuracy(output, target, topk=(1,)): optimizer = optim.SGD(model.parameters(), lr=lr, weight_decay=weight_decay,momentum=momentum) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',patience=10,threshold=1e-3,eps = 1e-5) - - ###################################################################### -# we use ``CIFAR100`` . -# +# As mentioned above, we are using the ``CIFAR100`` dataset and set gradient +# clipping to 1.0 to prevent gradient exploding. + if DatasetName == 'Cifar' : train_data = Custom_Cifar(root=os.getcwd(),download=True) val_data = Custom_Cifar(root=os.getcwd(),train=False,download=True) val_data.val= True val_data.s_min = test_min - val_data.transform= A.Compose( - [ + val_data.transform= A.Compose([ A.Normalize(mean =(0.5071, 0.4867, 0.4408) , std = (0.2675, 0.2565, 0.2761)), A.SmallestMaxSize(max_size=val_data.S), A.CenterCrop(height =224,width=224) - ] - - ) - -train_loader = torch.utils.data.DataLoader(train_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) -val_loader = torch.utils.data.DataLoader(val_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) - - -###################################################################### -# we set grad_clip to 1.0 for prevent gradient exploding. 
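######################################################################
# For readers new to gradient clipping, here is a minimal, self-contained
# sketch of the pattern the training loop uses: rescale the global gradient
# norm just before each optimizer step. The tiny linear model is purely
# illustrative and is not part of the tutorial's VGG pipeline.
#

import torch
from torch import nn

tiny = nn.Linear(4, 2)
opt = torch.optim.SGD(tiny.parameters(), lr=1e-2)

out = tiny(torch.randn(8, 4)).sum()
out.backward()
# Clip the total gradient norm to 1.0 before stepping.
torch.nn.utils.clip_grad_norm_(tiny.parameters(), max_norm=1.0)
opt.step()
opt.zero_grad(set_to_none=True)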
-# + ]) + train_loader = torch.utils.data.DataLoader(train_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) + val_loader = torch.utils.data.DataLoader(val_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) model = model.to(device) -grad_clip = 1.0 +grad_clip = 1.0 # setting gradient clipping to 1.0 for e in range(epoch) : print(f'Training Epoch : {e}') @@ -588,9 +568,10 @@ def accuracy(output, target, topk=(1,)): ###################################################################### -# (Optional) ImageNet -# ------------------- -# +# (Optional) Additional Exercise: ImageNet +# -------------------------------------------- +# +# You can apply the same model that we have trained above with another popular dataset called ImageNet: class Custom_ImageNet(ImageNet) : def __init__(self,root,transform = None,multi=False,s_max=None,s_min=256,split=None,val=False): @@ -663,20 +644,16 @@ def __getitem__(self, index: int) : ###################################################################### # Conclusion # ---------- -# We have seen how ``pretraining`` VGG from scratch . -# This Tutorial will be helpful to reproduce another Foundation Model . - -###################################################################### -# More things to try -# ------------------ -# - Apply model to ImageNet -# - Try all model variants -# - Try additional evaluation method - - -###################################################################### -# Further Reading -# --------------- - -# - `VGG training using python script `__ -# - `VGG paper `__ \ No newline at end of file +# +# In this tutorial, we have successfully demonstrated how to pretrain the VGG model +# from scratch. The techniques and insights provided in this tutorial can serve as +# a basis for reproducing and adapting other foundational models. +# +# If you are looking to expand your knowledge and application of the VGG model, +# consider exploring further by applying the model to the ImageNet dataset, experimenting +# with different model variants, and incorporating additional evaluation methods to +# enhance model robustness and performance. 
+# +# For more information, see: +# +# - `Very Deep Convolutional Networks for Large-Scale Image Recognition `__ From d25d920e501b51ebf9e54d75236e3ee93d4b3d4b Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 16 Sep 2024 11:23:12 -0700 Subject: [PATCH 12/21] Apply suggestions from code review Co-authored-by: Joel Schlosser <75754324+jbschlosser@users.noreply.github.com> --- beginner_source/Pretraining_Vgg_from_scratch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 14e6bf4f42..ca0888af64 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -1,5 +1,5 @@ """ -``Pretraining`` VGG from scratch +Pretraining VGG from scratch ============================ From e6c7a9b8765b79e5fb0971ba71f5cfe46c701e27 Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 16 Sep 2024 11:23:46 -0700 Subject: [PATCH 13/21] Apply suggestions from code review --- beginner_source/Pretraining_Vgg_from_scratch.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index ca0888af64..4f4dd0c280 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -11,7 +11,7 @@ architecture, understanding its components and the rationale behind its design. -Our tutorial is designed for both beginners who are new to deep learning +This tutorial is designed for both beginners who are new to deep learning and seasoned practitioners looking to deepen their understanding of CNN architectures. From be72b0139b2b62a0afef04d1195ddd63ed14108e Mon Sep 17 00:00:00 2001 From: Svetlana Karslioglu Date: Mon, 16 Sep 2024 12:56:30 -0700 Subject: [PATCH 14/21] Fix indentation --- .../Pretraining_Vgg_from_scratch.py | 190 +++++++++--------- 1 file changed, 95 insertions(+), 95 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index 4f4dd0c280..3192b13f3c 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -449,103 +449,103 @@ def accuracy(output, target, topk=(1,)): train_loader = torch.utils.data.DataLoader(train_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) val_loader = torch.utils.data.DataLoader(val_data,batch_size= batch_size,shuffle = True , num_workers=4,pin_memory = True,prefetch_factor = 2,drop_last = True) -model = model.to(device) - -grad_clip = 1.0 # setting gradient clipping to 1.0 - -for e in range(epoch) : - print(f'Training Epoch : {e}') - total_loss = 0 - val_iter = iter(val_loader) - train_acc=[0,0] - train_num = 0 - - total_acc = [0,0] - count= 0 - for i , data in enumerate(train_loader) : - - - model.train() - img,label= data - img,label =img.to(device, non_blocking=True) ,label.to(device, non_blocking=True) - - output = model(img) - - loss = criterion(output,label) /accum_step - - temp_output ,temp_label = output.detach().to('cpu') , label.detach().to('cpu') - temp_acc = accuracy(temp_output,temp_label,(1,5)) - train_acc=[train_acc[0]+temp_acc[0] , train_acc[1]+temp_acc[1]] - train_num+=batch_size - temp_output,temp_label,temp_acc = None,None,None - - loss.backward() - total_loss += loss.detach().to('cpu') - img,label=None,None - torch.cuda.empty_cache() 
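        # Editorial note: gradients are accumulated here rather than applied
        # every mini-batch. Since update_count = 256 // batch_size, the
        # optimizer step below sees an effective batch size of 256, matching
        # the configuration in the VGG paper.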
        if i > 0 and i%update_count == 0 :
            print(f'Training steps : {i} parameter update loss :{total_loss} ')
            if grad_clip is not None:
                torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
            optimizer.step()
            optimizer.zero_grad(set_to_none=True)

            if total_loss < 7.0 :
                # print(f"train loss {total_loss} less than 7.0, set grad clip to {clip}")
                grad_clip = clip
            if i % eval_step != 0 :
                total_loss = 0

            output,loss = None,None
            torch.cuda.empty_cache()
            if i > 0 and i % eval_step == 0 :

                print(f'train loss :{total_loss}')
                temp_loss = total_loss
                total_loss = 0

                val_loss = 0
                torch.cuda.empty_cache()

                for j in range(update_count) :
                    loss = None
                    print(f'Evaluation Steps Start')
                    try :
                        img,label = next(val_iter)
                    except StopIteration :
                        val_iter = iter(val_loader)
                        img,label = next(val_iter)
                    with torch.no_grad():
                        model.eval()

                        img , label = img.to(device, non_blocking=True) , label.to(device, non_blocking=True)
                        output = model(img)
                        temp_output ,temp_label =
output.detach().to('cpu') , label.detach().to('cpu') + temp_acc = accuracy(temp_output,temp_label,(1,5)) + total_acc=[total_acc[0]+temp_acc[0] , total_acc[1]+temp_acc[1]] + count+=batch_size + + loss = criterion(output,label)/accum_step + val_loss += loss.detach().to('cpu') + # loss.backward() + torch.cuda.empty_cache() + + + img,label,output ,loss= None,None,None,None + + + + torch.cuda.empty_cache() + + if abs(val_loss-temp_loss) > 0.03 : + grad_clip=clip + # print(f"val_loss {val_loss} - train_loss {temp_loss} = {abs(val_loss-temp_loss)} > 0.3") + # print(f"set grad clip to {grad_clip}") + + best_val_loss = val_loss + + val_loss = None + img,label,output = None,None,None From c900f665698e6625151c569794fbb16f5d466f9b Mon Sep 17 00:00:00 2001 From: woongjoonchoi Date: Tue, 17 Sep 2024 14:08:32 +0900 Subject: [PATCH 15/21] modify: VGG tutorial train loader fail --- beginner_source/Pretraining_Vgg_from_scratch.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py index ae066cd442..3059c038a6 100644 --- a/beginner_source/Pretraining_Vgg_from_scratch.py +++ b/beginner_source/Pretraining_Vgg_from_scratch.py @@ -1,5 +1,5 @@ """ -``Pretraining`` VGG from scratch +Pretraining VGG from scratch ============================ @@ -29,7 +29,12 @@ * Complete the `Learn the Basics tutorials `__ * Familiarity with basic machine learning concepts and terms -If you are running this in Google Colab, install albumentations +If you are running this in Google Colab, install ``albumentations`` by running the following command: + +.. code-block:: py + + + !pip install albumentations """ @@ -101,10 +106,10 @@ # # Unlike ``AlexNet``'s 5x5 9x9 filters, VGG only uses 3x3 filters. # Using multiple 3x3 filters can obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number of parameters. -# In addition, since it passes through multiple nonlinear functions, the nonlinearity increases even more. +# In addition, since it passes through multiple nonlinear functions, the ``nonlinearity`` increases even more. # # VGG applied a max pooling layer after multiple convolutional layers to reduce the spatial size. -# This allowed the feature map to be downsampled while preserving important information. +# This allowed the feature map to be ``downsampled`` while preserving important information. # Thanks to this, the network could learn high-dimensional features in deeper layers and prevent overfitting. ###################################################################### @@ -447,7 +452,7 @@ def accuracy(output, target, topk=(1,)): # we use ``CIFAR100`` . 
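######################################################################
# A hardcoded string comparison like the one below is easy to get wrong
# (note the ``'Cifar'`` vs. ``'CIFAR'`` fix in this change). One defensive
# alternative, shown here as an editorial sketch rather than part of the
# tutorial, is a small case-insensitive dispatch table built from the
# dataset classes defined earlier:
#

dataset_classes = {'cifar': Custom_Cifar}

def build_train_val(name, root):
    # Normalize the name so 'Cifar', 'CIFAR', and 'cifar' all resolve.
    cls = dataset_classes[name.lower()]
    return cls(root=root, download=True), cls(root=root, train=False, download=True)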
#

-if DatasetName == 'Cifar' :
+if DatasetName == 'CIFAR' :
     train_data = Custom_Cifar(root=os.getcwd(),download=True)
     val_data = Custom_Cifar(root=os.getcwd(),train=False,download=True)
     val_data.val= True

From 2db1099c54a04d9f0bd077851e71e1218f5ce2fd Mon Sep 17 00:00:00 2001
From: woongjoonchoi <woongjoonchoi1106@gmail.com>
Date: Tue, 17 Sep 2024 15:01:43 +0900
Subject: [PATCH 16/21] modify : VGG tutorial feedback accepted

Re PullRequest because of network issue
---
 .../Pretraining_Vgg_from_scratch.py           | 77 +++----------------
 1 file changed, 10 insertions(+), 67 deletions(-)

diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py
index b96c99f4aa..0487908df2 100644
--- a/beginner_source/Pretraining_Vgg_from_scratch.py
+++ b/beginner_source/Pretraining_Vgg_from_scratch.py
@@ -1,9 +1,5 @@
 """
-<<<<<<< HEAD
-Pretraining VGG from scratch
-=======
-Pretraining VGG from scratch
->>>>>>> origin/master
+Pre-training VGG from scratch
 ============================


@@ -37,26 +33,17 @@
 Overview
 ------------

-<<<<<<< HEAD
-If you are running this in Google Colab, install ``albumentations`` by running the following command:
-
-.. code-block:: py
-
-
-   !pip install albumentations
-=======
 VGG is a model that attracted attention due to its ability to build deeper layers and dramatically
-shorten the training time compared to AlexNet, which was the state-of-the-art model at the time of the publishing
+shorten the training time compared to ``AlexNet``, which was the state-of-the-art model at the time of the publishing
 of the `original paper <https://arxiv.org/abs/1409.1556>`__.
->>>>>>> origin/master
-Unlike AlexNet's 11x11 and 5x5 filters, VGG uses only 3x3 filters. Using multiple 3x3 filters can
+Unlike ``AlexNet``'s 11x11 and 5x5 filters, VGG uses only 3x3 filters. Using multiple 3x3 filters can
 obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number
-of parameters. In addition, since it passes through multiple nonlinear functions, the
-nonlinearity increases even more.
+of parameters. In addition, since it passes through multiple non-linear functions, the
+non-linearity increases even more.

 VGG applies a max pooling layer after multiple convolutional layers to reduce the spatial size.
-This allows the feature map to be downsampled while preserving important information. Thanks
+This allows the feature map to be down-sampled while preserving important information. Thanks
 to this, the network can learn high-dimensional features in deeper layers and prevent overfitting.

 In this tutorial, we will train the VGG model from scratch using only the configuration presented
@@ -103,55 +90,11 @@


 ######################################################################
-<<<<<<< HEAD
-# Purpose point of this tutorial
-# ----------------------------
-#
-
-
-######################################################################
-# - We train the model from scratch using only the configuration
-#   presented in the paper.
-#
-# - we do not use future method, like ``Batch normalization``,Adam , He
-#   initialization.
-#
-# - You can apply to ImageNet Data.
-#
-# - If you can download the ImageNet Data(140GB), you can apply this
-#   tutorial to reproduce Original VGG.
-#
-# - You can learn VGG within the training time suggested in the paper.
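######################################################################
# To make the receptive-field and parameter-count argument in the overview
# concrete, here is a small, self-contained check (an editorial addition,
# not part of the original tutorial): two stacked 3x3 convolutions cover the
# same 5x5 receptive field as a single 5x5 convolution but use fewer weights.
#

import torch
from torch import nn

c = 64  # channel count; any value shows the same trend
stacked_3x3 = nn.Sequential(
    nn.Conv2d(c, c, kernel_size=3, padding=1),
    nn.ReLU(inplace=True),
    nn.Conv2d(c, c, kernel_size=3, padding=1),
)
single_5x5 = nn.Conv2d(c, c, kernel_size=5, padding=2)

count = lambda m: sum(p.numel() for p in m.parameters())
print(count(stacked_3x3), count(single_5x5))  # 73856 vs. 102464 for c=64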
-# - - -###################################################################### -# Background -# ----------------------- -# - - -###################################################################### -# VGG became a model that attracted attention because it succeeded in -# building deeper layers and dramatically shortening the training time -# compared to ``AlexNet``, which was the SOTA model at the time. -# -# Unlike ``AlexNet``'s 5x5 9x9 filters, VGG only uses 3x3 filters. -# Using multiple 3x3 filters can obtain the same receptive field as using a 5x5 filter, but it is effective in reducing the number of parameters. -# In addition, since it passes through multiple nonlinear functions, the ``nonlinearity`` increases even more. -# -# VGG applied a max pooling layer after multiple convolutional layers to reduce the spatial size. -# This allowed the feature map to be ``downsampled`` while preserving important information. -# Thanks to this, the network could learn high-dimensional features in deeper layers and prevent overfitting. - -###################################################################### -======= ->>>>>>> origin/master # VGG Configuration # ----------------- # # In this section, we will define configurations suggested in the VGG paper. -# We use the CIFAR100 dataset. The authors of the VGG paper scale images isotropically, +# We use the CIFAR100 dataset. The authors of the VGG paper scale images ``isotropically``, # which means increasing the size of an image while maintaining its proportions, # preventing distortion and maintaining the consistency of the object. @@ -207,7 +150,7 @@ # -------------------- # # As mentioned above we use the CIFAR100 dataset in this tutorial. According to the VGG paper, -# the authors scale the images isotropically to maintain their proportions. This method, known +# the authors scale the images ``isotropically`` to maintain their proportions. This method, known # as isotropic scaling, increases the size of an image while preserving its aspect ratio, # thus avoiding distortion and maintaining object consistency. # @@ -421,7 +364,7 @@ def _init_weights(self,m): # Initializing Model Weights # ---------------------------- # -# ggIn the original VGG paper, the authors trained model A first and then +# In the original VGG paper, the authors trained model A first and then # used its weights as a starting point for training other variants. However, # this approach can be time-consuming. The authors also mentioned using Xavier # initialization as an alternative to initializing with model A's weights, @@ -702,7 +645,7 @@ def __getitem__(self, index: int) : # Conclusion # ---------- # -# In this tutorial, we have successfully demonstrated how to pretrain the VGG model +# In this tutorial, we have successfully demonstrated how to pre-train the VGG model # from scratch. The techniques and insights provided in this tutorial can serve as # a basis for reproducing and adapting other foundational models. # From a6e8ca2e10dc75a8917ad48bc91c2234bf00b489 Mon Sep 17 00:00:00 2001 From: Woongjun Choi Date: Tue, 24 Sep 2024 10:33:28 +0900 Subject: [PATCH 17/21] Update index.rst --- index.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/index.rst b/index.rst index e74bab7712..b85a1df278 100644 --- a/index.rst +++ b/index.rst @@ -999,6 +999,7 @@ Additional Resources intermediate/spatial_transformer_tutorial beginner/vt_tutorial intermediate/tiatoolbox_tutorial + beginner/Pretrainig_VGG_from_scratch .. 
toctree::
   :maxdepth: 2

From 7f58d3c7f5de7b8eeaf9d58426c615e41d9aef51 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 24 Sep 2024 08:40:00 -0700
Subject: [PATCH 18/21] Update index.rst

---
 index.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/index.rst b/index.rst
index b85a1df278..7edb8a3d29 100644
--- a/index.rst
+++ b/index.rst
@@ -1001,6 +1001,7 @@ Additional Resources
    intermediate/tiatoolbox_tutorial
    beginner/Pretrainig_VGG_from_scratch

+
 .. toctree::
    :maxdepth: 2
    :includehidden:

From f11d7987405611ce15f5d2563b671c495cd80aaa Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Tue, 24 Sep 2024 08:40:29 -0700
Subject: [PATCH 19/21] Update index.rst

---
 index.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/index.rst b/index.rst
index 7edb8a3d29..6661d36bb0 100644
--- a/index.rst
+++ b/index.rst
@@ -999,7 +999,6 @@ Additional Resources
    intermediate/spatial_transformer_tutorial
    beginner/vt_tutorial
    intermediate/tiatoolbox_tutorial
-   beginner/Pretrainig_VGG_from_scratch

 .. toctree::

From ef33eaaa1f241b5f3fc8e1db47b2733e650a4ee5 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Thu, 26 Sep 2024 09:18:09 -0700
Subject: [PATCH 20/21] Fix rendering

---
 index.rst | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/index.rst b/index.rst
index 6661d36bb0..d35a9622f6 100644
--- a/index.rst
+++ b/index.rst
@@ -999,7 +999,8 @@ Additional Resources
    intermediate/spatial_transformer_tutorial
    beginner/vt_tutorial
    intermediate/tiatoolbox_tutorial
-   beginner/Pretrainig_VGG_from_scratch
+   beginner/Pretraining_Vgg_from_scratch
+

 .. toctree::
    :maxdepth: 2

From d4bec3ab877cc855206f3f9646e1a49332c06a46 Mon Sep 17 00:00:00 2001
From: Svetlana Karslioglu <svekars@meta.com>
Date: Thu, 26 Sep 2024 09:18:18 -0700
Subject: [PATCH 21/21] Update beginner_source/Pretraining_Vgg_from_scratch.py

Co-authored-by: Joel Schlosser <75754324+jbschlosser@users.noreply.github.com>
---
 beginner_source/Pretraining_Vgg_from_scratch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/beginner_source/Pretraining_Vgg_from_scratch.py b/beginner_source/Pretraining_Vgg_from_scratch.py
index 0487908df2..2f96e5b8d0 100644
--- a/beginner_source/Pretraining_Vgg_from_scratch.py
+++ b/beginner_source/Pretraining_Vgg_from_scratch.py
@@ -289,8 +289,8 @@ def make_feature_extractor(cfg_c,cfg_k):
 class Model_vgg(nn.Module) :
     # def __init__(self,version , num_classes):
-    def __init__(self,conf_channels,conf_kernels , num_classes):
-        conv_5_out_w ,conv_5_out_h = 7,7
+    def __init__(self, conf_channels, conf_kernels, num_classes):
+        conv_5_out_w, conv_5_out_h = 7, 7
         conv_5_out_dim =512
         conv_1_by_1_1_outchannel = 4096
         conv_1_by_1_2_outchannel = 4096
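######################################################################
# The constants touched in the final patch above (a 512-channel 7x7 feature
# map feeding two 4096-wide "1 by 1" layers) suggest the paper's fully
# connected head expressed as convolutions. The sketch below is an editorial
# illustration of that idea under stated assumptions; the helper name
# ``sketch_head`` and the dropout placement are not from the tutorial itself.
#

import torch
from torch import nn

def sketch_head(num_classes, in_dim=512, hidden=4096):
    return nn.Sequential(
        nn.Conv2d(in_dim, hidden, kernel_size=7),   # consumes the 7x7 map (FC-as-conv)
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Conv2d(hidden, hidden, kernel_size=1),   # the "1 by 1" layer in the constants
        nn.ReLU(inplace=True),
        nn.Dropout(),
        nn.Conv2d(hidden, num_classes, kernel_size=1),
        nn.Flatten(),
    )

# Quick shape check on a dummy 512x7x7 feature map:
out = sketch_head(100)(torch.randn(1, 512, 7, 7))
print(out.shape)  # torch.Size([1, 100])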