diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py b/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py new file mode 100644 index 000000000..91668400e --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py @@ -0,0 +1,17 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["AVID-CMA-Kinetics400"] = lambda: commit_model("AVID-CMA-Kinetics400") +model_registry["AVID-CMA-Audioset"] = lambda: commit_model("AVID-CMA-Audioset") +model_registry["AVID-Kinetics400"] = lambda: commit_model("AVID-Kinetics400") +model_registry["AVID-Audioset"] = lambda: commit_model("AVID-Audioset") diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/model.py b/brainscore_vision/models/temporal_model_AVID-CMA/model.py new file mode 100644 index 000000000..60d91f690 --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/model.py @@ -0,0 +1,92 @@ +import yaml +import os + +import torch + +import avid_cma +from avid_cma.utils.logger import Logger +from avid_cma.utils import main_utils +from avid_cma.datasets import preprocessing + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +HOME = os.path.dirname(os.path.abspath(avid_cma.__file__)) + +def get_model(identifier): + + if identifier == 'AVID-CMA-Kinetics400': + cfg_path = os.path.join(HOME, "configs/main/avid-cma/kinetics/InstX-N1024-PosW-N64-Top32.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID-CMA_Kinetics_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", + version_id="yx9Pbq3SuNOOd4sX7csTolaHD1iTCx8y", + sha1="6efe4464ca654a56affff766acf24e89e6f3ffbf" + ) + + elif identifier == 'AVID-CMA-Audioset': + cfg_path = os.path.join(HOME, "configs/main/avid-cma/audioset/InstX-N1024-PosW-N64-Top32.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", + version_id="jSaZgbUohM0ZeoEUUKZiLBo6iz_v8VvQ", + sha1="9db5eba9aab6bdbb74025be57ab532df808fe3f6" + ) + + elif identifier == 'AVID-Kinetics400': + cfg_path = os.path.join(HOME, "configs/main/avid/kinetics/Cross-N1024.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar", + version_id="XyKt0UOUFsuuyrl6ZREivK8FadRPx34u", + sha1="d3a04f856d29421ba8de37808593a3fad4d4794f" + ) + + elif identifier == 'AVID-Audioset': + cfg_path = os.path.join(HOME, "configs/main/avid/audioset/Cross-N1024.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar", + version_id="0Sxuhn8LsYXQC4FnPfJ7rw7uU6kDlKgc", + sha1="b48d8428a1a2526ccca070f810333df18bfce5fd" + ) + + else: + raise ValueError(f"Unknown model identifier: {identifier}") + + + cfg = yaml.safe_load(open(cfg_path)) + 
cfg['model']['args']['checkpoint'] = weight_path + logger = Logger() + + # Define model + model = main_utils.build_model(cfg['model'], logger) + + # take only video model + model = model.video_model + + # Define dataloaders + db_cfg = cfg['dataset'] + print(db_cfg) + + num_frames = int(db_cfg['video_clip_duration'] * db_cfg['video_fps']) + + _video_transform = preprocessing.VideoPrep_Crop_CJ( + resize=(256, 256), + crop=(db_cfg['crop_size'], db_cfg['crop_size']), + augment=False, + num_frames=num_frames, + pad_missing=True, + ) + + def video_transform(video): + frames = video.to_pil_imgs() + return _video_transform(frames) + + layer_activation_format = { + 'conv1': 'CTHW', + **{f"conv{i}x": 'CTHW' for i in range(2, 6)}, + } + + return PytorchWrapper(identifier, model, video_transform, fps=db_cfg['video_fps'], layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt b/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt new file mode 100644 index 000000000..47cc15207 --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt @@ -0,0 +1,3 @@ +avid_cma @ git+https://github.com/YingtianDt/AVID-CMA.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/test.py b/brainscore_vision/models/temporal_model_AVID-CMA/test.py new file mode 100644 index 000000000..d775f732d --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/test.py @@ -0,0 +1,18 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "AVID-CMA-Kinetics400", + "AVID-CMA-Audioset", + "AVID-Kinetics400", + "AVID-Audioset" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/__init__.py b/brainscore_vision/models/temporal_model_GDT/__init__.py new file mode 100644 index 000000000..29d479d31 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/__init__.py @@ -0,0 +1,16 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["GDT-Kinetics400"] = lambda: commit_model("GDT-Kinetics400") +model_registry["GDT-HowTo100M"] = lambda: commit_model("GDT-HowTo100M") +model_registry["GDT-IG65M"] = lambda: commit_model("GDT-IG65M") diff --git a/brainscore_vision/models/temporal_model_GDT/model.py b/brainscore_vision/models/temporal_model_GDT/model.py new file mode 100644 index 000000000..624a5b29b --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/model.py @@ -0,0 +1,72 @@ +import torch + +from gdt_model.model import GDT +from gdt_model.video_transforms import clip_augmentation + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +def transform_video(video): + arr = video.to_numpy() + arr = torch.as_tensor(arr) + return clip_augmentation(arr) + + +def get_model(identifier): + + assert identifier.startswith("GDT-") + dataset = "-".join(identifier.split("-")[1:]) + + if dataset == "Kinetics400": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_K400.pth", + version_id="JpU_tnCzrbTejn6sOrQMk8eRsJ97yFgt", + sha1="7f12c60670346b1aab15194eb44c341906e1bca6" + ) + elif dataset == "IG65M": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_IG65M.pth", + version_id="R.NoD6VAbFbJdf8tg5jnXIWB3hQ8GlSD", + sha1="3dcee3af61691e1e7e47e4b115be6808f4ea8172" + ) + elif dataset == "HowTo100M": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_HT100M.pth", + version_id="BVRl9t_134PoKZCn9W54cyfkImCW2ioq", + sha1="a9a979c82e83b955794814923af736eb34e6f080" + ) + else: + raise ValueError(f"Unknown dataset: {dataset}") + + # Load model + model = GDT( + vid_base_arch="r2plus1d_18", + aud_base_arch="resnet9", + pretrained=False, + norm_feat=False, + use_mlp=False, + num_classes=256, + ) + + model = model.video_network # Remove audio network + + # Load weights + state_dict_ = torch.load(pth, map_location="cpu")['model'] + state_dict = {} + for k, v in list(state_dict_.items()): + if k.startswith("video_network."): + k = k[len("video_network."):] + state_dict[k] = v + model.load_state_dict(state_dict) + + layer_activation_format = { + "base.stem": "CTHW", + **{f"base.layer{i}": "CTHW" for i in range(1, 5)}, + # "base.fc": "C", # no fc + } + + return PytorchWrapper(identifier, model, transform_video, fps=30, layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/requirements.txt b/brainscore_vision/models/temporal_model_GDT/requirements.txt new file mode 100644 index 000000000..35ea5ddb9 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/requirements.txt @@ -0,0 +1,3 @@ +gdt_model @ git+https://github.com/YingtianDt/GDT.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/test.py b/brainscore_vision/models/temporal_model_GDT/test.py new file mode 100644 index 000000000..f3092c785 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/test.py @@ -0,0 +1,17 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "GDT-Kinetics400", + "GDT-HowTo100M", + 
"GDT-IG65M", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py b/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py new file mode 100644 index 000000000..5b7c12472 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py @@ -0,0 +1,14 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["s3d-HowTo100M"] = lambda: commit_model("s3d-HowTo100M") diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/model.py b/brainscore_vision/models/temporal_model_S3D_text_video/model.py new file mode 100644 index 000000000..d463caf9b --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/model.py @@ -0,0 +1,65 @@ +import torch +import numpy as np +from torchvision import transforms +from s3dg_howto100m import S3D + +from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +img_transform = transforms.Compose([ + transforms.Resize((256, 256)), +]) + +def transform_video(video): + frames = video.to_numpy() / 255. 
+ frames = torch.Tensor(frames) + frames = frames.permute(0, 3, 1, 2) + frames = img_transform(frames) + return frames.permute(1, 0, 2, 3) + + +def get_model(identifier="s3d-HowTo100M"): + inferencer_kwargs = { + "fps": 24, # common YouTube frame rate + "layer_activation_format": + { + "conv1": "CTHW", + "conv_2c": "CTHW", + "mixed_3c": "CTHW", + "mixed_4b": "CTHW", + "mixed_4d": "CTHW", + "mixed_4f": "CTHW", + "mixed_5c": "CTHW", + "fc": "C" + }, + } + process_output = None + + model_name = identifier + + model_pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_S3D_text_video/s3d_howto100m.pth", + version_id="hRp6I8bpwreIMUVL0H.zCdK0hqRggL7n", + sha1="31e99d2a1cd48f2259ca75e719ac82c8b751ea75" + ) + + dict_pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_S3D_text_video/s3d_dict.npy", + version_id="4NxVLe8DSL6Uue0F7e2rz8HZuOk.tkBI", + sha1="d368ff7d397ec8240f1f963b5efe8ff245bac35f" + ) + + # Instantiate the model + model = S3D(dict_pth, 512) + + # Load the model weights + model.load_state_dict(torch.load(model_pth)) + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_output, + **inferencer_kwargs) + + return wrapper \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt new file mode 100644 index 000000000..73f27f3b6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt @@ -0,0 +1 @@ +S3D_HowTo100M @ git+https://github.com/YingtianDt/S3D_HowTo100M \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/test.py b/brainscore_vision/models/temporal_model_S3D_text_video/test.py new file mode 100644 index 000000000..e6c7fdb18 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/test.py @@ -0,0 +1,15 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "s3d-HowTo100M", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/__init__.py b/brainscore_vision/models/temporal_model_SeLaVi/__init__.py new file mode 100644 index 000000000..68f5deecf --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/__init__.py @@ -0,0 +1,17 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["SeLaVi-Kinetics400"] = lambda: commit_model("SeLaVi-Kinetics400") +model_registry["SeLaVi-Kinetics-Sound"] = lambda: commit_model("SeLaVi-Kinetics-Sound") +model_registry["SeLaVi-VGG-Sound"] = lambda: commit_model("SeLaVi-VGG-Sound") +model_registry["SeLaVi-AVE"] = lambda: commit_model("SeLaVi-AVE") diff --git a/brainscore_vision/models/temporal_model_SeLaVi/model.py b/brainscore_vision/models/temporal_model_SeLaVi/model.py new file mode 100644 index 000000000..d6c34eb02 --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/model.py @@ -0,0 +1,68 @@ +import torch + +from selavi.model import load_model +from selavi.video_transforms import clip_augmentation + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file + + +def transform_video(video): + arr = video.to_numpy() + arr = torch.as_tensor(arr) + return clip_augmentation(arr) + + +def get_model(identifier): + + assert identifier.startswith("SeLaVi-") + dataset = "-".join(identifier.split("-")[1:]) + + if dataset == "Kinetics400": + model_name = "selavi_kinetics.pth" + num_classes = 400 + elif dataset == "Kinetics-Sound": + model_name = "selavi_kinetics_sound.pth" + num_classes = 32 + elif dataset == "VGG-Sound": + model_name = "selavi_vgg_sound.pth" + num_classes = 309 + elif dataset == "AVE": + model_name = "selavi_ave.pth" + num_classes = 28 + else: + raise ValueError(f"Unknown dataset: {dataset}") + + url = f"https://dl.fbaipublicfiles.com/selavi/{model_name}" + pth = download_weight_file(url, folder="temporal_model_SeLaVi") + + # Load model + model = load_model( + vid_base_arch="r2plus1d_18", + aud_base_arch="resnet9", + use_mlp=True, + num_classes=num_classes, + pretrained=False, + norm_feat=False, + use_max_pool=False, + headcount=10, + ) + + model = model.video_network # Remove audio network + + # Load weights + state_dict_ = torch.load(pth, map_location="cpu")['model'] + state_dict = {} + for k, v in list(state_dict_.items()): + if k.startswith("module.video_network."): + k = k[len("module.video_network."):] + state_dict[k] = v + model.load_state_dict(state_dict) + + layer_activation_format = { + "base.stem": "CTHW", + **{f"base.layer{i}": "CTHW" for i in range(1, 5)}, + # "base.fc": "C", # no fc + } + + return PytorchWrapper(identifier, model, transform_video, fps=30, layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt b/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt new file mode 100644 index 000000000..ecc9cdae6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +selavi @ git+https://github.com/YingtianDt/selavi.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/test.py b/brainscore_vision/models/temporal_model_SeLaVi/test.py new file mode 100644 index 000000000..1ad8c439c --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/test.py @@ -0,0 +1,18 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "SeLaVi-Kinetics400", + "SeLaVi-Kinetics-Sound", + 
"SeLaVi-VGG-Sound", + "SeLaVi-AVE" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py b/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py new file mode 100644 index 000000000..1579fc0e4 --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py @@ -0,0 +1,14 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + +model_registry["VideoMAE-V2-B"] = lambda: commit_model("VideoMAE-V2-B") +model_registry["VideoMAE-V2-G"] = lambda: commit_model("VideoMAE-V2-G") diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py new file mode 100644 index 000000000..cb0f2f94c --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py @@ -0,0 +1,117 @@ + +import numpy as np +import torch +from timm.models import create_model +from torchvision import transforms + +# NOTE: Do not comment `import models`, it is used to register models +from videomae_v2 import * # noqa: F401 + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +LAYER_SELECT_STEP = 2 + +def to_normalized_float_tensor(vid): + vid = torch.Tensor(vid.to_numpy()) + return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 + +# NOTE: for those functions, which generally expect mini-batches, we keep them +# as non-minibatch so that they are applied as if they were 4d (thus image). 
+# this way, we only apply the transformation in the spatial domain +def resize(vid, size, interpolation='bilinear'): + # NOTE: using bilinear interpolation because we don't work on minibatches + # at this level + scale = None + if isinstance(size, int): + scale = float(size) / min(vid.shape[-2:]) + size = None + return torch.nn.functional.interpolate( + vid, + size=size, + scale_factor=scale, + mode=interpolation, + align_corners=False) + +class ToFloatTensorInZeroOne(object): + + def __call__(self, vid): + return to_normalized_float_tensor(vid) + + +class Resize(object): + + def __init__(self, size): + self.size = size + + def __call__(self, vid): + return resize(vid, self.size) + + +transform_video = transforms.Compose( + [ToFloatTensorInZeroOne(), + Resize((224, 224))]) + +def get_model(identifier): + + if identifier == "VideoMAE-V2-G": + model_name = "vit_giant_patch14_224" + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_VideoMAEv2/vit_g_hybrid_pt_1200e.pth", + version_id="TxtkfbeMV105dzpzTwi0Kn5glnvQvIrq", + # sha1="9048f2bc0b0c7ba4d0e5228f3a7c0bef4dbaca69", + sha1="32126231526fe310a6aba20c16d0e6435f5f0bb8" + ) + num_blocks = 40 + feature_map_size = 16 + elif identifier == "VideoMAE-V2-B": + model_name = "vit_base_patch16_224" + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_VideoMAEv2/vit_b_hybrid_pt_800e.pth", + version_id="rRjpYq21dAQ5KaCLbEHK.YaLZ_fbMPKw", + sha1="1e3602691964b1eb6f7c33529119243a5b235635" + ) + num_blocks = 12 + feature_map_size = 14 + + num_frames = 16 + + model = create_model(model_name) + + ckpt = torch.load(pth, map_location='cpu') + for model_key in ['model', 'module']: + if model_key in ckpt: + ckpt = ckpt[model_key] + break + + encoder_ckpt = {} + for k, v in ckpt.items(): + if k.startswith("encoder."): + encoder_ckpt[k[8:]] = v + + msg = model.load_state_dict(encoder_ckpt, strict=False) + print(msg) + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "patch_embed": "THWC", + **{f"blocks.{i}": "THWC" for i in range(0, num_blocks, LAYER_SELECT_STEP)}, + # "head": "THWC" # weight not available + }, + "num_frames": num_frames, + } + + def process_activation(layer, layer_name, inputs, output): + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, feature_map_size, feature_map_size, C) + return output + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_activation, + **inferencer_kwargs) + return wrapper diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt b/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt new file mode 100644 index 000000000..5af2d95b6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt @@ -0,0 +1,4 @@ +torch +torchvision +timm +videomae_v2 @ git+https://github.com/YingtianDt/VideoMAEv2.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/test.py b/brainscore_vision/models/temporal_model_VideoMAEv2/test.py new file mode 100644 index 000000000..905296c3a --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/test.py @@ -0,0 +1,16 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "VideoMAE-V2-B", + "VideoMAE-V2-G", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert 
model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/__init__.py b/brainscore_vision/models/temporal_model_mae_st/__init__.py new file mode 100644 index 000000000..6eb152a20 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/__init__.py @@ -0,0 +1,15 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["MAE-ST-L"] = lambda: commit_model("MAE-ST-L") +model_registry["MAE-ST-G"] = lambda: commit_model("MAE-ST-G") \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/model.py b/brainscore_vision/models/temporal_model_mae_st/model.py new file mode 100644 index 000000000..fc8f399b5 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/model.py @@ -0,0 +1,100 @@ +import torch +from iopath.common.file_io import g_pathmgr as pathmgr +from mae_st import models_vit +from mae_st.util import misc +from mae_st.util.pos_embed import interpolate_pos_embed + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +LAYER_SELECT_STEP = 2 +mean = (0.45, 0.45, 0.45) +std = (0.225, 0.225, 0.225) + +from torchvision import transforms + +transform_img = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.Normalize(mean, std), +]) + + +def transform_video(video): + import torch + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_img(frames) + return frames.permute(1, 0, 2, 3) + + + +def get_model(identifier): + + if identifier == "MAE-ST-L": + model_name = "vit_large_patch16" + num_blocks = 24 + feature_map_size = 14 + load_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_mae_st/mae_pretrain_vit_large_k400.pth", + version_id="cPcP4AzpG95CimQ5Pn.CHKnGUJlLXM3m", + sha1="c7fb91864a4ddf8b99309440121a3abe66b846bb" + ) + + elif identifier == "MAE-ST-G": + model_name = "vit_huge_patch14" + num_blocks = 32 + feature_map_size = 16 + load_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_mae_st/mae_pretrain_vit_huge_k400.pth", + version_id="IYKa8QiocgBzo3EhsBouS62HboK6iqYT", + sha1="177e48577142ca01949c08254834ffa1198b9eb4" + ) + + num_frames = 16 + t_patch_size = 2 + + model = models_vit.__dict__[model_name]( + num_frames=num_frames, + t_patch_size=t_patch_size + ) + + with pathmgr.open(load_path, "rb") as f: + checkpoint = torch.load(f, map_location="cpu") + + print("Load pre-trained checkpoint from: %s" % load_path) + if "model" in checkpoint.keys(): + checkpoint_model = checkpoint["model"] + else: + checkpoint_model = checkpoint["model_state"] + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + checkpoint_model = misc.convert_checkpoint(checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "patch_embed": "THWC", + 
**{f"blocks.{i}": "THWC" for i in range(0, num_blocks, LAYER_SELECT_STEP)}, + # "head": "THWC" # weight not available + }, + "num_frames": num_frames, + } + + def process_activation(layer, layer_name, inputs, output): + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, feature_map_size, feature_map_size, C) + return output + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_activation, + **inferencer_kwargs) + + return wrapper diff --git a/brainscore_vision/models/temporal_model_mae_st/requirements.txt b/brainscore_vision/models/temporal_model_mae_st/requirements.txt new file mode 100644 index 000000000..0d1858c8e --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/requirements.txt @@ -0,0 +1,3 @@ +mae_st @ git+https://github.com/YingtianDt/mae_st.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/test.py b/brainscore_vision/models/temporal_model_mae_st/test.py new file mode 100644 index 000000000..0f2b2cb56 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/test.py @@ -0,0 +1,16 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "MAE-ST-L", + "MAE-ST-G", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/__init__.py b/brainscore_vision/models/temporal_model_mmaction2/__init__.py new file mode 100644 index 000000000..685183ff9 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/__init__.py @@ -0,0 +1,23 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["I3D"] = lambda: commit_model("I3D") +model_registry["I3D-nonlocal"] = lambda: commit_model("I3D-nonlocal") +model_registry["SlowFast"] = lambda: commit_model("SlowFast") +model_registry["X3D"] = lambda: commit_model("X3D") +model_registry["TimeSformer"] = lambda: commit_model("TimeSformer") +model_registry["VideoSwin-B"] = lambda: commit_model("VideoSwin-B") +model_registry["VideoSwin-L"] = lambda: commit_model("VideoSwin-L") +model_registry["UniFormer-V1"] = lambda: commit_model("UniFormer-V1") +model_registry["UniFormer-V2-B"] = lambda: commit_model("UniFormer-V2-B") +model_registry["UniFormer-V2-L"] = lambda: commit_model("UniFormer-V2-L") diff --git a/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv b/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv new file mode 100644 index 000000000..9f253b57f --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv @@ -0,0 +1,24 @@ +name,config,checkpoint,Kinetics400-top1,Kinetics400-top5,FLOPs,params,note +I3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth,73.47,91.27,43.5G,28.0M, +I3D-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth,,,,, +TSM,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth,75.12,91.55,65.75G,23.87M, +SlowFast,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth,,,,, +X3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_m_16x5x1_facebook-kinetics400-rgb_20201027-3f42382a.pth,,,,, +TimeSformer,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth,,,,, 
+VideoSwin-B,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth,,,,, +VideoSwin-L,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth,,,,, +UniFormer-V1,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth,,,,, +UniFormer-V2-B,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth,,,,, +UniFormer-V2-L,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth,,,,, +VideoMAE-V2,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py,https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth,86.6,97.3,180G,87M, +VideoMAE-V1,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py,https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth,81.3,95.0,180G,87M, +R2plus1D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth,75.46,92.28,213G,63.8M, +I3D-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth,74.80,92.07,59.3G,35.4M, 
+TSN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth,74.12,91.34,102.7G,24.33M, +C3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth,,,38.5G,78.4M, +UniFormer-V2,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth,,,0.1T,115M, +VideoSwin,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth,80.57,94.49,282G,88.0M, +C2D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth,73.44,91.00,33G,24.3M, +TSM-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb_20220831-108bfde5.pth,74.49,91.15,61.30G,31.68M, +CSN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-44395bae.pth,79.44,94.26,55.90G,13.13M, +TPN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-97d0835d.pth,74.20,91.48,,, \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/model.py b/brainscore_vision/models/temporal_model_mmaction2/model.py new file mode 100644 index 000000000..351158842 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/model.py @@ -0,0 +1,226 @@ +import os +import numpy as np + +import mmengine +import mmaction +from mmaction.apis import init_recognizer +from mmengine.registry import init_default_scope +from mmengine.dataset import Compose, 
pseudo_collate + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper + + +LARGE_MODEL_LAYER_STEP = 2 +HOME = os.path.join(os.path.dirname(mmaction.__file__), "models") + + +class MMActionModelWrapper(PytorchWrapper): + meta = None + + def load_meta(self, path=os.path.join(os.path.dirname(__file__), "mmaction2.csv")): + if self.meta is None: + import pandas as pd + self.meta = pd.read_csv(path) + + def __init__(self, model_name, process_output=None, *args, **kwargs): + self.load_meta() + + _num_frames = None + num_frames = kwargs.get("num_frames") + if isinstance(num_frames, (list, tuple)): + if num_frames[0] == num_frames[1]: + _num_frames = num_frames + elif num_frames is not None: + _num_frames = num_frames + + model_data = self.meta[self.meta['name'] == model_name].iloc[0] # return a Series + config = model_data['config'] + checkpoint = model_data['checkpoint'] + config = config.replace("https://github.com/open-mmlab/mmaction2/blob/main/", "") + config_path = os.path.join(HOME, config) + config = mmengine.Config.fromfile(config_path) + + test_pipeline_cfg = config.test_pipeline + # SampleFrames: clip_len x frame_interval (sampling interval) x num_clips + # change every ThreeCrop and TenCrop to CenterCrop + for i, pipeline in enumerate(test_pipeline_cfg): + if pipeline['type'] in ['ThreeCrop', 'TenCrop']: + test_pipeline_cfg[i] = {'type': 'CenterCrop', 'crop_size': pipeline['crop_size']} + if pipeline['type'] in ['SampleFrames']: + test_pipeline_cfg[i].update({"num_clips": 1, 'frame_interval': 1}) + + model = init_recognizer(config, checkpoint, device="cpu") + init_default_scope(model.cfg.get('default_scope', 'mmaction')) + test_pipeline = Compose(test_pipeline_cfg[3:]) + + def transform_video(video): + imgs = video.to_numpy() + data = {'imgs': imgs, 'num_clips': 1, 'modality': 'RGB'} + if _num_frames is not None: + data['clip_len'] = _num_frames + assert len(imgs) == _num_frames + else: + data['clip_len'] = len(imgs) + + data = test_pipeline(data) + return data + + super().__init__(model_name, model, transform_video, process_output, *args, **kwargs) + + def forward(self, inputs): + data = pseudo_collate(inputs) + data["inputs"] = [d.to(self._device) for d in data["inputs"]] + result = self._model.test_step(data)[0] + return result + + +def get_model(identifier): + if identifier == "I3D": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.conv1": "CTHW", # too large: (C: 64, T: *, H: 128, W: 128) + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + "num_frames": (5, np.inf), + } + + if identifier == "I3D-nonlocal": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.conv1": "CTHW", # too large: (C: 64, T: *, H: 128, W: 128) + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + "num_frames": (5, np.inf), + } + + if identifier == "TSM": + process_output = None + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": {}, + } + + if identifier == "SlowFast": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.slow_path.conv1_lateral": "CTHW", + **{f"backbone.slow_path.layer{i}_lateral": "CTHW" for i in range(1, 4)}, + "cls_head": "C", + }, + "num_frames": 32, # TODO: in fact can be multiple of 4? 
+ } + + if identifier == "X3D": + process_output = None + inferencer_kwargs = { + "fps": 30, + "layer_activation_format": { + "backbone.conv1_t": "CTHW", + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + } + + if identifier == "TimeSformer": + inferencer_kwargs = { + "fps": 8, + "layer_activation_format": { + "backbone.patch_embed": "CTHW", + **{f"backbone.transformer_layers.layers.{i}": "HWTC" for i in range(0, 12, LARGE_MODEL_LAYER_STEP)}, + "cls_head": "C", + }, + "num_frames": 8 + } + def process_output(layer, layer_name, inputs, output): + if layer_name == "backbone.patch_embed": + B = inputs[0].shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, 14, 14, C) + if layer_name.startswith("backbone.transformer_layers.layers."): + output = output[:, 1:] + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, 14, 14, -1, C) + return output + + if identifier in ["VideoSwin-B", "VideoSwin-L"]: + + transformer_layers = { + **{f"backbone.layers.0.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.1.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.2.blocks.{i}": "THWC" for i in range(0, 18, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.3.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.patch_embed": "CTHW", + **transformer_layers, + "cls_head": "C", + }, + } + process_output = None + + if identifier == "UniFormer-V1": + + transformer_layers = { + **{f"backbone.blocks1.{i}": "CTHW" for i in range(0, 5, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks2.{i}": "CTHW" for i in range(0, 8, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks3.{i}": "CTHW" for i in range(0, 20, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks4.{i}": "CTHW" for i in range(0, 7, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "backbone.pos_drop": "CTHW", + **transformer_layers, + "cls_head": "C", + }, + } + process_output = None + + if identifier.startswith("UniFormer-V2"): + + if identifier == "UniFormer-V2-B": + num_frames = 8 + num_transformer_layers = 12 + img_size = 14 + elif identifier == "UniFormer-V2-L": + num_frames = 32 + num_transformer_layers = 24 + img_size = 16 + + transformer_layers = { + **{f"backbone.transformer.resblocks.{i}": "HWTC" for i in range(0, num_transformer_layers, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": { + "backbone.conv1": "CTHW", + **transformer_layers, + "backbone": "C", + "cls_head": "C", + }, + "num_frames": num_frames + } + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("backbone.transformer.resblocks."): + T = inputs[1] + C = output.shape[-1] + output = output[1:] # remove the class token + output = output.reshape(img_size, img_size, -1, T, C).permute(2, 0, 1, 3, 4) # BHWTC + return output + + model = MMActionModelWrapper(identifier, process_output, **inferencer_kwargs) + return model \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt new file mode 100644 index 000000000..1caa728dd --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt @@ -0,0 +1,5 @@ +importlib-metadata<5 +mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git@533edc3 
+mmengine +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/test.py b/brainscore_vision/models/temporal_model_mmaction2/test.py new file mode 100644 index 000000000..d539c623a --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/test.py @@ -0,0 +1,24 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "I3D", + "I3D-nonlocal", + "SlowFast", + "X3D", + "TimeSformer", + "VideoSwin-B", + "VideoSwin-L", + "UniFormer-V1", + "UniFormer-V2-B", + "UniFormer-V2-L", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/__init__.py b/brainscore_vision/models/temporal_model_openstl/__init__.py new file mode 100644 index 000000000..2b49cc845 --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/__init__.py @@ -0,0 +1,19 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["ConvLSTM"] = lambda: commit_model("ConvLSTM") +model_registry["PredRNN"] = lambda: commit_model("PredRNN") +# model_registry["PredNet"] = lambda: commit_model("PredNet") +model_registry["SimVP"] = lambda: commit_model("SimVP") +model_registry["TAU"] = lambda: commit_model("TAU") +model_registry["MIM"] = lambda: commit_model("MIM") diff --git a/brainscore_vision/models/temporal_model_openstl/model.py b/brainscore_vision/models/temporal_model_openstl/model.py new file mode 100644 index 000000000..aed3e0464 --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/model.py @@ -0,0 +1,223 @@ +import os +import imp +import numpy as np +from collections import OrderedDict + +import torch +from torchvision import transforms +import openstl +from openstl.methods import method_maps +from openstl.utils import reshape_patch + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file + + +# We only use models trained on KITTI dataset, because it is the most ecological, +# diverse, challenging, and widely used dataset for next frame prediction among +# the datasets used by OpenSTL repo. 
+IMAGE_SIZES = (128, 160) # for KITTI +KITTI_CONFIG_DIR = os.path.join(os.path.dirname(openstl.__file__), "configs/kitticaltech") +KITTI_FPS = 10 # from paper: https://www.cvlibs.net/publications/Geiger2012CVPR.pdf +LARGE_MODEL_LAYER_STEP = 2 + +transform_image = transforms.Resize(IMAGE_SIZES) + + +class LSTMWrapper(PytorchWrapper): + def _register_hook(self, layer, layer_name, target_dict): + def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict): + output = self._process_activation(_layer, name, _input, output) + target_dict.setdefault(name, []).append(PytorchWrapper._tensor_to_numpy(output)) + + hook = layer.register_forward_hook(hook_function) + return hook + + def get_activations(self, inputs, layer_names): + self._model.eval() + layer_results = OrderedDict() + hooks = [] + + for layer_name in layer_names: + layer = self.get_layer(layer_name) + hook = self._register_hook(layer, layer_name, target_dict=layer_results) + hooks.append(hook) + + with torch.no_grad(): + self.forward(inputs) + + for hook in hooks: + hook.remove() + + # stack the T dim to be the second dim + for layer_name, activations in layer_results.items(): + layer_results[layer_name] = np.stack(activations, axis=1) + + return layer_results + + def forward(self, inputs): + tensor = torch.stack(inputs) + tensor = tensor.to(self._device) + return self._model(tensor, return_loss=False) + + +class MIMWrapper(LSTMWrapper): + def forward(self, inputs): + output = super().forward(inputs) + # clear MIMBlock.convlstm_c + def _clear_helper(module): + if hasattr(module, "convlstm_c"): + module.convlstm_c = None + for child in module.children(): + _clear_helper(child) + _clear_helper(self._model) + return output + + +def _get_config(name, parent_dir): + config = imp.load_source(name, os.path.join(parent_dir, f"{name}.py")).__dict__ + config = {k: v for k, v in config.items() if not k.startswith("__")} + return config + + +def get_model(identifier): + config = _get_config(identifier, KITTI_CONFIG_DIR) + config["method"] = config["method"].lower() + config['dataname'] = "kitticaltech" + config['dataname'] = "kitticaltech" + config['metrics'] = ['mse', 'mae'] # not in use, just to initialize the model + config['in_shape'] = [None, 3, *IMAGE_SIZES] + + if identifier == "PredRNN": + layer_activation_format = { + **{f"cell_list.{i}": "TCHW" for i in range(4)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c, m = output + return m + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_predrnn_one_ep100.pth" + + elif identifier == "PredNet": + layer_activation_format = { + **{f"layer{i}": "TCHW" for i in range(4)}, + "layer5": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c = output + return c + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_prednet_one_ep100.pth" + + elif identifier == "ConvLSTM": + layer_activation_format = { + **{f"cell_list.{i}": "TCHW" for i in range(4)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c = output + return c + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_convlstm_one_ep100.pth" + + elif identifier in ["SimVP", "TAU"]: + num_frames = 10 + layer_activation_format = { + **{f"enc.enc.{i}": "TCHW" for i in 
range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"hid.enc.{i}": "TCHW" for i in range(0, 6, LARGE_MODEL_LAYER_STEP)}, + **{f"dec.dec.{i}": "TCHW" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + } + + config['in_shape'] = [num_frames, 3, *IMAGE_SIZES] + wrapper_cls = PytorchWrapper + kwargs = {"num_frames": num_frames} + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("enc") or layer_name.startswith("dec"): + output = output.view(-1, num_frames, *output.shape[1:]) + elif layer_name.startswith("hid"): + output = output[:, None] # time-compressed layers + return output + if identifier == "SimVP": + weight_name = "kitticaltech_simvp_gsta_one_ep100.pth" + elif identifier == "TAU": + weight_name = "kitticaltech_tau_one_ep100.pth" + + elif identifier == "MIM": + layer_activation_format = { + **{f"stlstm_layer.{i}": "TCHW" for i in range(0, 4, LARGE_MODEL_LAYER_STEP)}, + **{f"stlstm_layer_diff.{i}": "TCHW" for i in range(0, 3, LARGE_MODEL_LAYER_STEP)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("stlstm_layer."): + h, c, m = output + ret = m + elif layer_name.startswith("stlstm_layer_diff."): + h, c = output + ret = c + else: + ret = output + return ret + + wrapper_cls = MIMWrapper + kwargs = {} + weight_name = "kitticaltech_mim_one_ep100.pth" + + + model = method_maps[config["method"]](**config).model + url = f"https://github.com/chengtan9907/OpenSTL/releases/download/kitti-weights/{weight_name}" + weight_path = download_weight_file(url, folder="temporal_model_openstl") + model.load_state_dict(torch.load(weight_path, map_location="cpu")) + + def transform_video_lstm(video): + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_image(frames) + frames = frames.permute(0, 2, 3, 1)[None, :] # BTHWC + patch_size = config["patch_size"] + assert 5 == frames.ndim + batch_size, seq_length, img_height, img_width, num_channels = frames.shape + a = frames.reshape(batch_size, seq_length, + img_height//patch_size, patch_size, + img_width//patch_size, patch_size, + num_channels) + b = a.transpose(3, 4) + patches = b.reshape(batch_size, seq_length, + img_height//patch_size, + img_width//patch_size, + patch_size*patch_size*num_channels)[0] + return patches + + def transform_video_simvp(video): + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_image(frames) + return frames + + if identifier in ("PredRNN", "ConvLSTM", "MIM"): + transform_video = transform_video_lstm + else: + transform_video = transform_video_simvp + + return wrapper_cls(identifier, model, transform_video, fps=KITTI_FPS, + layer_activation_format=layer_activation_format, + process_output=process_output, **kwargs) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/requirements.txt b/brainscore_vision/models/temporal_model_openstl/requirements.txt new file mode 100644 index 000000000..4d839c16f --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +openstl @ git+https://github.com/YingtianDt/OpenSTL.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/test.py b/brainscore_vision/models/temporal_model_openstl/test.py new file mode 100644 index 000000000..4d52b76ce --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/test.py @@ -0,0 +1,20 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + 
"ConvLSTM", + "PredRNN", + "PredNet", + "SimVP", + "TAU", + "MIM" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/__init__.py b/brainscore_vision/models/temporal_model_torchvision/__init__.py new file mode 100644 index 000000000..54820f87f --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/__init__.py @@ -0,0 +1,19 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry['r3d_18'] = lambda: commit_model('r3d_18') +model_registry['r2plus1d_18'] = lambda: commit_model('r2plus1d_18') +model_registry['mc3_18'] = lambda: commit_model('mc3_18') +model_registry['s3d'] = lambda: commit_model('s3d') +model_registry['mvit_v1_b'] = lambda: commit_model('mvit_v1_b') +model_registry['mvit_v2_s'] = lambda: commit_model('mvit_v2_s') \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/model.py b/brainscore_vision/models/temporal_model_torchvision/model.py new file mode 100644 index 000000000..30d96aba8 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/model.py @@ -0,0 +1,92 @@ +import torch +import numpy as np +from torchvision import transforms +from torchvision.models import video as vid + +from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper + + +LARGE_MODEL_LAYER_STEP = 2 + +def get_transform_video(transform_img): + def transform_video(video): + frames = video.to_numpy() / 255. 
+ frames = torch.Tensor(frames) + frames = frames.permute(0, 3, 1, 2) + frames = transform_img(frames) + return frames.permute(1, 0, 2, 3) + return transform_video + + +def get_model(identifier): + if identifier in ["r3d_18", "r2plus1d_18", "mc3_18"]: + img_transform = transforms.Compose([ + transforms.Resize((128, 171)), + transforms.CenterCrop(112), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": + { + "stem": "CTHW", + **{f'layer{i}': "CTHW" for i in range(1, 5)}, + "avgpool": "CTHW", + "fc": "C" + }, + } + process_output = None + + elif identifier == "s3d": + img_transform = transforms.Compose([ + transforms.Resize((256, 256)), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 15, + "num_frames": (13, np.inf), + "layer_activation_format": + { + **{f"features.{i}": "CTHW" for i in range(0, 16, LARGE_MODEL_LAYER_STEP)}, + "avgpool": "CTHW", + "classifier": "CTHW" + } + } + process_output = None + + elif identifier in ["mvit_v1_b", "mvit_v2_s"]: + img_transform = transforms.Compose([ + transforms.Resize((256, 256)), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 7.5, + "num_frames": 16, + "layer_activation_format": { + "conv_proj": "CTHW", + **{f"blocks.{i}": "THWC" for i in range(0, 16, LARGE_MODEL_LAYER_STEP)}, + "head": "C", + } + } + + def process_output(layer, layer_name, input, output): + if layer_name.startswith("blocks"): + output, thw = output + t, h, w = thw + output = output[:, 1:] # remove cls + b, n, c = output.shape + assert n == t*h*w + output = output.view(b, t, h, w, c) + return output + return output + + vid_transform = get_transform_video(img_transform) + model_name = identifier + model = getattr(vid, model_name)(weights="KINETICS400_V1") + wrapper = PytorchWrapper(identifier, model, vid_transform, + process_output=process_output, + **inferencer_kwargs) + + return wrapper \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/requirements.txt b/brainscore_vision/models/temporal_model_torchvision/requirements.txt new file mode 100644 index 000000000..37f700a78 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/test.py b/brainscore_vision/models/temporal_model_torchvision/test.py new file mode 100644 index 000000000..77486ad03 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/test.py @@ -0,0 +1,20 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "r3d_18", + "r2plus1d_18", + "mc3_18", + "s3d", + "mvit_v1_b", + "mvit_v2_s", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file
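
Note for reviewers: every plugin above registers its identifiers through the same commit_model() pattern, so loading any of them follows the path exercised by the per-plugin test.py files. A minimal sketch, assuming brainscore_vision and the given plugin's requirements.txt are installed and the private-access weights are reachable (nothing below is part of the diff itself):

    from brainscore_vision import load_model

    # Each registered identifier resolves to commit_model(identifier), which builds the
    # activations model, reads its specified layers via get_specified_layers, and wraps
    # both in a ModelCommitment.
    model = load_model("GDT-Kinetics400")  # any identifier registered above works the same way
    assert model is not None               # mirrors the assertion in each plugin's test.py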