BYOL improvements

facebookresearch · Oct 17, 2021 · 9fabd5f · 9fabd5f
1 parent 6e3063d
commit 9fabd5f
Show file tree

Hide file tree

Showing 12 changed files with 127 additions and 635 deletions.
diff --git a/...gs/config/benchmark/linear_image_classification/imagenet1k/byol_transfer_in1k_linear.yaml b/...gs/config/benchmark/linear_image_classification/imagenet1k/byol_transfer_in1k_linear.yaml
@@ -22,6 +22,7 @@ config:
       TRANSFORMS:
         - name: RandomResizedCrop
           size: 224
+          interpolation: 3
         - name: RandomHorizontalFlip
         - name: ToTensor
         - name: Normalize
@@ -38,6 +39,7 @@ config:
       TRANSFORMS:
         - name: Resize
           size: 256
+          interpolation: 3
         - name: CenterCrop
           size: 224
         - name: ToTensor
@@ -82,7 +84,7 @@ config:
       PARAMS_FILE: "specify the model weights"
       STATE_DICT_KEY_NAME: classy_state_dict
     SYNC_BN_CONFIG:
-      CONVERT_BN_TO_SYNC_BN: True
+      CONVERT_BN_TO_SYNC_BN: False
       SYNC_BN_TYPE: apex
       GROUP_SIZE: 8
   LOSS:
@@ -93,22 +95,29 @@ config:
       name: sgd
       momentum: 0.9
       num_epochs: 80
+      weight_decay: 0
       nesterov: True
       regularize_bn: False
       regularize_bias: True
       param_schedulers:
         lr:
           auto_lr_scaling:
-            auto_scale: true
-            base_value: 0.4
+            # if set to True, learning rate will be scaled.
+            auto_scale: True
+            # base learning rate value that will be scaled.
+            base_value: 0.2
+            # batch size for which the base learning rate is specified. The current batch size
+            # is used to determine how to scale the base learning rate value:
+            # scaled_lr = (batchsize_per_gpu * world_size * base_value) / base_lr_batch_size
             base_lr_batch_size: 256
-          name: multistep
-          values: [0.4, 0.3, 0.2, 0.1, 0.05]
-          milestones: [16, 32, 48, 64]
-          update_interval: epoch
+            # scaling_type can be set to "sqrt" to reduce the impact of scaling on the base value
+            scaling_type: "linear"
+          name: constant
+          update_interval: "epoch"
+          value: 0.2
   DISTRIBUTED:
     BACKEND: nccl
-    NUM_NODES: 8
+    NUM_NODES: 4
     NUM_PROC_PER_NODE: 8
     INIT_METHOD: tcp
     RUN_ID: auto

diff --git a/configs/config/pretrain/byol/byol_1node_resnet.yaml b/configs/config/pretrain/byol/byol_1node_resnet.yaml
diff --git a/configs/config/pretrain/byol/byol_8node_resnet.yaml b/configs/config/pretrain/byol/byol_8node_resnet.yaml
@@ -67,7 +67,7 @@ config:
       RESNETS:
         DEPTH: 50
         ZERO_INIT_RESIDUAL: True
-    HEAD:     
+    HEAD:
       PARAMS: [
         ["mlp", {"dims": [2048, 4096, 256], "use_relu": True, "use_bn": True}],
         ["mlp", {"dims": [256, 4096, 256], "use_relu": True, "use_bn": True}]
@@ -82,15 +82,16 @@ config:
       byol_loss:
         embedding_dim: 256
         momentum: 0.99
-  OPTIMIZER:   # from official BYOL implementation, deepmind-research/byol/configs/byol.py
+  OPTIMIZER:
       name: lars
-      trust_coefficient: 0.001
+      eta: 0.001
       weight_decay: 1.0e-6
       momentum: 0.9
       nesterov: False
       num_epochs: 300
       regularize_bn: False
-      regularize_bias: True
+      regularize_bias: False
+      exclude_bias_and_norm: True
       param_schedulers:
         lr:
           auto_lr_scaling:

diff --git a/configs/config/quick_1gpu_resnet50_byol.yaml b/configs/config/quick_1gpu_resnet50_byol.yaml