RoB distillation + JEPA evaluations (#284)

Summary: Pull Request resolved: fairinternal/ssl_scaling#284 Reviewed By: odelalleau Differential Revision: D42220017 Pulled By: QuentinDuval fbshipit-source-id: 742419aa859fdbe4bc80f1f9e9f4771fee0f41a2
facebookresearch · Dec 28, 2022 · 04788de · 04788de
1 parent 346114a
commit 04788de
Show file tree

Hide file tree

Showing 259 changed files with 13,408 additions and 791 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -55,7 +55,8 @@ install_classy_vision: &install_classy_vision
  working_directory: ~/
  command: |
  pip uninstall -y classy_vision
- pip install classy-vision@https://github.com/facebookresearch/ClassyVision/tarball/main
+ pip install classy-vision@https://github.com/facebookresearch/ClassyVision/tarball/4785d5ee19d3bcedd5b28c1eb51ea1f59188b54d
+
 
 setup_venv: &setup_venv
  - run:
@@ -151,7 +152,7 @@ jobs:
  # Cache the vissl_venv directory that contains dependencies
  - restore_cache:
  keys:
- - v8-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+ - v9-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
 
  - <<: *install_vissl_dep
  - <<: *install_augly
@@ -163,7 +164,7 @@ jobs:
  - save_cache:
  paths:
  - ~/vissl_venv
- key: v8-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
+ key: v9-cpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}
 
  - <<: *install_vissl
 
@@ -195,7 +196,7 @@ jobs:
  # Download and cache dependencies
  - restore_cache:
  keys:
- - v8-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
+ - v9-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
 
  - <<: *install_vissl_dep
  - <<: *install_classy_vision
@@ -210,7 +211,7 @@ jobs:
  - save_cache:
  paths:
  - ~/vissl_venv
- key: v8-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
+ key: v9-gpu-dependencies-{{ checksum "requirements.txt" }}-{{ checksum "setup.py" }}-{{ checksum "docker/common/install_apex.sh" }}
 
  - <<: *install_vissl
 

diff --git a/configs/config/benchmark/fulltune/imagenet1k/eval_resnet_8gpu_transfer_in1k_fulltune.yaml b/configs/config/benchmark/fulltune/imagenet1k/eval_resnet_8gpu_transfer_in1k_fulltune.yaml
@@ -50,9 +50,6 @@ config:
  TRAINER:
  TRAIN_STEP_NAME: standard_train_step
  MODEL:
- FEATURE_EVAL_SETTINGS:
- EVAL_MODE_ON: True
- EVAL_TRUNK_AND_HEAD: False
  TRUNK:
  NAME: resnet
  RESNETS:

diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_timm.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_timm.yaml
@@ -0,0 +1,14 @@
+# @package _global_
+config:
+ MODEL:
+ TRUNK:
+ NAME: mobilenetv3_timm
+ MOBILE_NET:
+ NAME: mobilenetv3_large_100
+ TRUNK_ONLY: True
+ HEAD:
+ PARAMS: [
+ ["mobilenet_v3_head_timm", {"num_classes": 1000}],
+ ]
+ OPTIMIZER:
+ regularize_bn: True
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_tv.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/mobilenet_v3_tv.yaml
@@ -0,0 +1,12 @@
+# @package _global_
+config:
+ MODEL:
+ TRUNK:
+ NAME: mobilenetv3_tv
+ MOBILE_NET:
+ NAME: mobilenetv3_large_100
+ TIMM_BN: False
+ HEAD:
+ PARAMS: [
+ ["mobilenet_v3_head", {"num_classes": 1000}],
+ ]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnet18_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnet18_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+ MODEL:
+ TRUNK:
+ NAME: resnet
+ RESNETS:
+ DEPTH: 18
+ HEAD:
+ PARAMS: [['eval_mlp', {'in_channels': 512, 'dims': [512, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnet34_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnet34_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+ MODEL:
+ TRUNK:
+ NAME: resnet
+ RESNETS:
+ DEPTH: 34
+ HEAD:
+ PARAMS: [['eval_mlp', {'in_channels': 512, 'dims': [512, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/resnext50_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/resnext50_eval_mlp.yaml
@@ -0,0 +1,9 @@
+# @package _global_
+config:
+ MODEL:
+ TRUNK:
+ NAME: resnet
+ RESNETS:
+ DEPTH: 50
+ HEAD:
+ PARAMS: [['eval_mlp', {'in_channels': 2048, 'dims': [2048, 1000]}]]
diff --git a/configs/config/benchmark/fulltune/imagenet1k/models/vit_tiny_cls4_eval_mlp.yaml b/configs/config/benchmark/fulltune/imagenet1k/models/vit_tiny_cls4_eval_mlp.yaml
@@ -0,0 +1,27 @@
+# @package _global_
+config:
+ MODEL:
+ FEATURE_EVAL_SETTINGS:
+ EVAL_MODE_ON: True
+ FREEZE_TRUNK_AND_HEAD: True
+ LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+ ["concatCLS4", ["Identity", []] ],
+ ]
+ TRUNK: # Tiny
+ NAME: vision_transformer
+ VISION_TRANSFORMERS:
+ IMAGE_SIZE: 224
+ PATCH_SIZE: 16
+ NUM_LAYERS: 12
+ NUM_HEADS: 3
+ HIDDEN_DIM: 192
+ MLP_DIM: 768
+ CLASSIFIER: token
+ DROPOUT_RATE: 0
+ ATTENTION_DROPOUT_RATE: 0
+ QKV_BIAS: True
+ DROP_PATH_RATE: 0.0
+ HEAD:
+ PARAMS: [
+ ["eval_mlp", {"in_channels": 768, "dims": [768, 1000]}],
+ ]
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/beit_vit_l16.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/beit_vit_l16.yaml
@@ -1,29 +1,5 @@
 # @package _global_
 config:
- DATA:
- TRAIN:
- BATCHSIZE_PER_REPLICA: 32
- TRANSFORMS:
- - name: RandomResizedCrop
- size: 224
- interpolation: 3
- - name: RandomHorizontalFlip
- - name: ToTensor
- - name: Normalize
- mean: [0.485, 0.456, 0.406]
- std: [0.229, 0.224, 0.225]
- TEST:
- BATCHSIZE_PER_REPLICA: 32
- TRANSFORMS:
- - name: Resize
- size: 256
- interpolation: 3
- - name: CenterCrop
- size: 224
- - name: ToTensor
- - name: Normalize
- mean: [0.485, 0.456, 0.406]
- std: [0.229, 0.224, 0.225]
  MODEL:
  FEATURE_EVAL_SETTINGS:
  LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
@@ -54,9 +30,5 @@ config:
  ["eval_mlp", {"in_channels": 4096, "dims": [4096, 100]}],
  ["eval_mlp", {"in_channels": 1024, "dims": [1024, 100]}],
  ]
- WEIGHTS_INIT:
- PARAMS_FILE: "manifold://ssl_framework/tree/gfsai-bistro2-east/ai-group/users/prigoyal/vissl/oss_beit_large_patch16_224_pt22k.pth"
- APPEND_PREFIX: trunk.base_model.
- STATE_DICT_KEY_NAME: 'model'
  OPTIMIZER:
  regularize_bn: True
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_timm.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_timm.yaml
@@ -0,0 +1,20 @@
+# @package _global_
+config:
+ MODEL:
+ FEATURE_EVAL_SETTINGS:
+ LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+ ["flatten", ["Identity", []] ],
+ ["flatten", ["Identity", []] ],
+ ]
+ TRUNK:
+ NAME: mobilenetv3_timm
+ MOBILE_NET:
+ NAME: mobilenetv3_large_100
+ PRETRAINED: False
+ HEAD:
+ PARAMS: [
+ ["eval_mlp", {"in_channels": 1280, "dims": [1280, 100]}],
+ ["mlp", {"dims": [1280, 100]}],
+ ]
+ OPTIMIZER:
+ regularize_bn: True
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_tv.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/mobilenet_v3_tv.yaml
@@ -0,0 +1,105 @@
+# @package _global_
+config:
+ MODEL:
+ FEATURE_EVAL_SETTINGS:
+ EVAL_MODE_ON: True
+ FREEZE_TRUNK_ONLY: True
+ SHOULD_FLATTEN_FEATS: True
+ LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+ # Linear heads on top of representations, normalized or not
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+
+ # MobileNet head on top of representations, normalized or not
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+ # ["trunk_pool", ["Identity", []] ],
+ # ["trunk_pool", ["Identity", []] ],
+
+ # Exploring a two-layer head
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+ ["trunk_pool", ["Identity", []] ],
+
+ # Combining several levels of representations
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 1]]]],
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+ ["trunk", ["AdaptiveAvgPool2d", [[2, 2]]]],
+ ]
+ TRUNK:
+ NAME: mobilenetv3_tv
+ MOBILE_NET:
+ NAME: mobilenetv3_large_100
+ PRETRAINED: False
+ HEAD:
+ PARAMS: [
+ # Linear heads on top of representations, normalized or not
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 100]}],
+
+ # MobileNet head on top of representations, normalized or not
+ ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+ ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+ ["mobilenet_v3_head", {"with_bn": True, "num_classes": 100}],
+ # ["mobilenet_v3_head", {"with_bn": True, "drop_out": 0.1, "num_classes": 100}],
+ # ["mobilenet_v3_head", {"with_bn": True, "drop_out": 0.0, "num_classes": 100}],
+
+ # Exploring a two-layer head
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+ ["eval_mlp", {"in_channels": 960, "dims": [960, 1280, 100]}],
+
+ # Combining several levels of representations
+ ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+ ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+ ["eval_mlp", {"in_channels": 1920, "dims": [1920, 100]}],
+ ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+ ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+ ["eval_mlp", {"in_channels": 3840, "dims": [3840, 100]}],
+ ]
+ OPTIMIZER:
+ name: sgd
+ # In the OSS Caffe2 benchmark, RN50 models use 1e-4 and AlexNet models 5e-4
+ weight_decay: 0.0005
+ momentum: 0.9
+ num_epochs: 28
+ nesterov: True
+ regularize_bn: True
+ regularize_bias: True
+ param_schedulers:
+ lr:
+ auto_lr_scaling:
+ auto_scale: true
+ base_value: 0.01
+ base_lr_batch_size: 256
+ name: multistep
+ values: [0.01, 0.001, 0.0001, 0.00001]
+ milestones: [8, 16, 24]
+ update_interval: epoch
+ param_group_constructor: linear_eval_heads
+ linear_eval_heads:
+ # Linear heads on top of representations, normalized or not
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+ - {"lr": 1.0, "weight_decay": 0.0}
+ # MobileNet head on top of representations, normalized or not
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+ - {"lr": 1.0, "weight_decay": 0.0}
+ # Exploring a two-layer head
+ - {"lr": 1.0, "weight_decay": 0.0005}
+ - {"lr": 1.0, "weight_decay": 0.0001}
+ - {"lr": 1.0, "weight_decay": 0.0}
+ # Combining several levels of representations
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+ - {"lr": 1.0, "weight_decay": 0.0}
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": True}
+ - {"lr": 1.0, "weight_decay": 0.0005, "regularize_bn": False}
+ - {"lr": 1.0, "weight_decay": 0.0}
diff --git a/configs/config/benchmark/linear_image_classification/cifar100/models/vit_g16_no_cls.yaml b/configs/config/benchmark/linear_image_classification/cifar100/models/vit_g16_no_cls.yaml
@@ -0,0 +1,54 @@
+# @package _global_
+config:
+ DATA:
+ TRAIN:
+ TRANSFORMS:
+ - name: RandomResizedCrop
+ size: 224
+ interpolation: 3
+ - name: RandomHorizontalFlip
+ - name: ToTensor
+ - name: Normalize
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+ TEST:
+ TRANSFORMS:
+ - name: Resize
+ size: 256
+ interpolation: 3
+ - name: CenterCrop
+ size: 224
+ - name: ToTensor
+ - name: Normalize
+ mean: [0.485, 0.456, 0.406]
+ std: [0.229, 0.224, 0.225]
+ MODEL:
+ FEATURE_EVAL_SETTINGS:
+ LINEAR_EVAL_FEAT_POOL_OPS_MAP: [
+ ["concatPOOL4", ["Identity", []] ],
+ ["lastPOOL", ["Identity", []] ],
+ ["concatPOOL4", ["Identity", []] ],
+ ["lastPOOL", ["Identity", []] ],
+ ]
+ TRUNK: # g-16
+ NAME: vision_transformer
+ VISION_TRANSFORMERS:
+ IMAGE_SIZE: 224
+ PATCH_SIZE: 16
+ NUM_LAYERS: 40
+ NUM_HEADS: 16
+ HIDDEN_DIM: 1408
+ MLP_DIM: 6144
+ DROPOUT_RATE: 0.0
+ ATTENTION_DROPOUT_RATE: 0.0
+ CLASSIFIER: token
+ QKV_BIAS: True
+ DROP_PATH_RATE: 0.0
+ USE_CLASS_TOKEN: False
+ HEAD:
+ PARAMS: [
+ ["eval_mlp", {"in_channels": 5632, "dims": [5632, 100]}],
+ ["eval_mlp", {"in_channels": 1408, "dims": [1408, 100]}],
+ ["mlp", {"dims": [5632, 100]}],
+ ["mlp", {"dims": [1408, 100]}],
+ ]