diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py b/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py new file mode 100644 index 000000000..91668400e --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/__init__.py @@ -0,0 +1,17 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["AVID-CMA-Kinetics400"] = lambda: commit_model("AVID-CMA-Kinetics400") +model_registry["AVID-CMA-Audioset"] = lambda: commit_model("AVID-CMA-Audioset") +model_registry["AVID-Kinetics400"] = lambda: commit_model("AVID-Kinetics400") +model_registry["AVID-Audioset"] = lambda: commit_model("AVID-Audioset") diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/model.py b/brainscore_vision/models/temporal_model_AVID-CMA/model.py new file mode 100644 index 000000000..60d91f690 --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/model.py @@ -0,0 +1,92 @@ +import yaml +import os + +import torch + +import avid_cma +from avid_cma.utils.logger import Logger +from avid_cma.utils import main_utils +from avid_cma.datasets import preprocessing + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +HOME = os.path.dirname(os.path.abspath(avid_cma.__file__)) + +def get_model(identifier): + + if identifier == 'AVID-CMA-Kinetics400': + cfg_path = os.path.join(HOME, "configs/main/avid-cma/kinetics/InstX-N1024-PosW-N64-Top32.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID-CMA_Kinetics_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", + version_id="yx9Pbq3SuNOOd4sX7csTolaHD1iTCx8y", + sha1="6efe4464ca654a56affff766acf24e89e6f3ffbf" + ) + + elif identifier == 'AVID-CMA-Audioset': + cfg_path = os.path.join(HOME, "configs/main/avid-cma/audioset/InstX-N1024-PosW-N64-Top32.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID-CMA_Audioset_InstX-N1024-PosW-N64-Top32_checkpoint.pth.tar", + version_id="jSaZgbUohM0ZeoEUUKZiLBo6iz_v8VvQ", + sha1="9db5eba9aab6bdbb74025be57ab532df808fe3f6" + ) + + elif identifier == 'AVID-Kinetics400': + cfg_path = os.path.join(HOME, "configs/main/avid/kinetics/Cross-N1024.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID_Kinetics_Cross-N1024_checkpoint.pth.tar", + version_id="XyKt0UOUFsuuyrl6ZREivK8FadRPx34u", + sha1="d3a04f856d29421ba8de37808593a3fad4d4794f" + ) + + elif identifier == 'AVID-Audioset': + cfg_path = os.path.join(HOME, "configs/main/avid/audioset/Cross-N1024.yaml") + weight_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_AVID-CMA/AVID_Audioset_Cross-N1024_checkpoint.pth.tar", + version_id="0Sxuhn8LsYXQC4FnPfJ7rw7uU6kDlKgc", + sha1="b48d8428a1a2526ccca070f810333df18bfce5fd" + ) + + else: + raise ValueError(f"Unknown model identifier: {identifier}") + + + cfg = yaml.safe_load(open(cfg_path)) + 
cfg['model']['args']['checkpoint'] = weight_path + logger = Logger() + + # Define model + model = main_utils.build_model(cfg['model'], logger) + + # take only video model + model = model.video_model + + # Define dataloaders + db_cfg = cfg['dataset'] + print(db_cfg) + + num_frames = int(db_cfg['video_clip_duration'] * db_cfg['video_fps']) + + _video_transform = preprocessing.VideoPrep_Crop_CJ( + resize=(256, 256), + crop=(db_cfg['crop_size'], db_cfg['crop_size']), + augment=False, + num_frames=num_frames, + pad_missing=True, + ) + + def video_transform(video): + frames = video.to_pil_imgs() + return _video_transform(frames) + + layer_activation_format = { + 'conv1': 'CTHW', + **{f"conv{i}x": 'CTHW' for i in range(2, 6)}, + } + + return PytorchWrapper(identifier, model, video_transform, fps=db_cfg['video_fps'], layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt b/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt new file mode 100644 index 000000000..47cc15207 --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/requirements.txt @@ -0,0 +1,3 @@ +avid_cma @ git+https://github.com/YingtianDt/AVID-CMA.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_AVID-CMA/test.py b/brainscore_vision/models/temporal_model_AVID-CMA/test.py new file mode 100644 index 000000000..d775f732d --- /dev/null +++ b/brainscore_vision/models/temporal_model_AVID-CMA/test.py @@ -0,0 +1,18 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "AVID-CMA-Kinetics400", + "AVID-CMA-Audioset", + "AVID-Kinetics400", + "AVID-Audioset" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/__init__.py b/brainscore_vision/models/temporal_model_GDT/__init__.py new file mode 100644 index 000000000..29d479d31 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/__init__.py @@ -0,0 +1,16 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["GDT-Kinetics400"] = lambda: commit_model("GDT-Kinetics400") +model_registry["GDT-HowTo100M"] = lambda: commit_model("GDT-HowTo100M") +model_registry["GDT-IG65M"] = lambda: commit_model("GDT-IG65M") diff --git a/brainscore_vision/models/temporal_model_GDT/model.py b/brainscore_vision/models/temporal_model_GDT/model.py new file mode 100644 index 000000000..624a5b29b --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/model.py @@ -0,0 +1,72 @@ +import torch + +from gdt_model.model import GDT +from gdt_model.video_transforms import clip_augmentation + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +def transform_video(video): + arr = video.to_numpy() + arr = torch.as_tensor(arr) + return clip_augmentation(arr) + + +def get_model(identifier): + + assert identifier.startswith("GDT-") + dataset = "-".join(identifier.split("-")[1:]) + + if dataset == "Kinetics400": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_K400.pth", + version_id="JpU_tnCzrbTejn6sOrQMk8eRsJ97yFgt", + sha1="7f12c60670346b1aab15194eb44c341906e1bca6" + ) + elif dataset == "IG65M": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_IG65M.pth", + version_id="R.NoD6VAbFbJdf8tg5jnXIWB3hQ8GlSD", + sha1="3dcee3af61691e1e7e47e4b115be6808f4ea8172" + ) + elif dataset == "HowTo100M": + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_GDT/gdt_HT100M.pth", + version_id="BVRl9t_134PoKZCn9W54cyfkImCW2ioq", + sha1="a9a979c82e83b955794814923af736eb34e6f080" + ) + else: + raise ValueError(f"Unknown dataset: {dataset}") + + # Load model + model = GDT( + vid_base_arch="r2plus1d_18", + aud_base_arch="resnet9", + pretrained=False, + norm_feat=False, + use_mlp=False, + num_classes=256, + ) + + model = model.video_network # Remove audio network + + # Load weights + state_dict_ = torch.load(pth, map_location="cpu")['model'] + state_dict = {} + for k, v in list(state_dict_.items()): + if k.startswith("video_network."): + k = k[len("video_network."):] + state_dict[k] = v + model.load_state_dict(state_dict) + + layer_activation_format = { + "base.stem": "CTHW", + **{f"base.layer{i}": "CTHW" for i in range(1, 5)}, + # "base.fc": "C", # no fc + } + + return PytorchWrapper(identifier, model, transform_video, fps=30, layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/requirements.txt b/brainscore_vision/models/temporal_model_GDT/requirements.txt new file mode 100644 index 000000000..35ea5ddb9 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/requirements.txt @@ -0,0 +1,3 @@ +gdt_model @ git+https://github.com/YingtianDt/GDT.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_GDT/test.py b/brainscore_vision/models/temporal_model_GDT/test.py new file mode 100644 index 000000000..f3092c785 --- /dev/null +++ b/brainscore_vision/models/temporal_model_GDT/test.py @@ -0,0 +1,17 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "GDT-Kinetics400", + "GDT-HowTo100M", + 
"GDT-IG65M", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py b/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py new file mode 100644 index 000000000..5b7c12472 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/__init__.py @@ -0,0 +1,14 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["s3d-HowTo100M"] = lambda: commit_model("s3d-HowTo100M") diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/model.py b/brainscore_vision/models/temporal_model_S3D_text_video/model.py new file mode 100644 index 000000000..d463caf9b --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/model.py @@ -0,0 +1,65 @@ +import torch +import numpy as np +from torchvision import transforms +from s3dg_howto100m import S3D + +from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +img_transform = transforms.Compose([ + transforms.Resize((256, 256)), +]) + +def transform_video(video): + frames = video.to_numpy() / 255. 
+ frames = torch.Tensor(frames) + frames = frames.permute(0, 3, 1, 2) + frames = img_transform(frames) + return frames.permute(1, 0, 2, 3) + + +def get_model(identifier="s3d-HowTo100M"): + inferencer_kwargs = { + "fps": 24, # common YouTube frame rate + "layer_activation_format": + { + "conv1": "CTHW", + "conv_2c": "CTHW", + "mixed_3c": "CTHW", + "mixed_4b": "CTHW", + "mixed_4d": "CTHW", + "mixed_4f": "CTHW", + "mixed_5c": "CTHW", + "fc": "C" + }, + } + process_output = None + + model_name = identifier + + model_pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_S3D_text_video/s3d_howto100m.pth", + version_id="hRp6I8bpwreIMUVL0H.zCdK0hqRggL7n", + sha1="31e99d2a1cd48f2259ca75e719ac82c8b751ea75" + ) + + dict_pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_S3D_text_video/s3d_dict.npy", + version_id="4NxVLe8DSL6Uue0F7e2rz8HZuOk.tkBI", + sha1="d368ff7d397ec8240f1f963b5efe8ff245bac35f" + ) + + # Instantiate the model + model = S3D(dict_pth, 512) + + # Load the model weights + model.load_state_dict(torch.load(model_pth)) + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_output, + **inferencer_kwargs) + + return wrapper \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt new file mode 100644 index 000000000..73f27f3b6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/requirements.txt @@ -0,0 +1 @@ +S3D_HowTo100M @ git+https://github.com/YingtianDt/S3D_HowTo100M \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_S3D_text_video/test.py b/brainscore_vision/models/temporal_model_S3D_text_video/test.py new file mode 100644 index 000000000..e6c7fdb18 --- /dev/null +++ b/brainscore_vision/models/temporal_model_S3D_text_video/test.py @@ -0,0 +1,15 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "s3d-HowTo100M", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/__init__.py b/brainscore_vision/models/temporal_model_SeLaVi/__init__.py new file mode 100644 index 000000000..68f5deecf --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/__init__.py @@ -0,0 +1,17 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["SeLaVi-Kinetics400"] = lambda: commit_model("SeLaVi-Kinetics400") +model_registry["SeLaVi-Kinetics-Sound"] = lambda: commit_model("SeLaVi-Kinetics-Sound") +model_registry["SeLaVi-VGG-Sound"] = lambda: commit_model("SeLaVi-VGG-Sound") +model_registry["SeLaVi-AVE"] = lambda: commit_model("SeLaVi-AVE") diff --git a/brainscore_vision/models/temporal_model_SeLaVi/model.py b/brainscore_vision/models/temporal_model_SeLaVi/model.py new file mode 100644 index 000000000..d6c34eb02 --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/model.py @@ -0,0 +1,68 @@ +import torch + +from selavi.model import load_model +from selavi.video_transforms import clip_augmentation + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file + + +def transform_video(video): + arr = video.to_numpy() + arr = torch.as_tensor(arr) + return clip_augmentation(arr) + + +def get_model(identifier): + + assert identifier.startswith("SeLaVi-") + dataset = "-".join(identifier.split("-")[1:]) + + if dataset == "Kinetics400": + model_name = "selavi_kinetics.pth" + num_classes = 400 + elif dataset == "Kinetics-Sound": + model_name = "selavi_kinetics_sound.pth" + num_classes = 32 + elif dataset == "VGG-Sound": + model_name = "selavi_vgg_sound.pth" + num_classes = 309 + elif dataset == "AVE": + model_name = "selavi_ave.pth" + num_classes = 28 + else: + raise ValueError(f"Unknown dataset: {dataset}") + + url = f"https://dl.fbaipublicfiles.com/selavi/{model_name}" + pth = download_weight_file(url, folder="temporal_model_SeLaVi") + + # Load model + model = load_model( + vid_base_arch="r2plus1d_18", + aud_base_arch="resnet9", + use_mlp=True, + num_classes=num_classes, + pretrained=False, + norm_feat=False, + use_max_pool=False, + headcount=10, + ) + + model = model.video_network # Remove audio network + + # Load weights + state_dict_ = torch.load(pth, map_location="cpu")['model'] + state_dict = {} + for k, v in list(state_dict_.items()): + if k.startswith("module.video_network."): + k = k[len("module.video_network."):] + state_dict[k] = v + model.load_state_dict(state_dict) + + layer_activation_format = { + "base.stem": "CTHW", + **{f"base.layer{i}": "CTHW" for i in range(1, 5)}, + # "base.fc": "C", # no fc + } + + return PytorchWrapper(identifier, model, transform_video, fps=30, layer_activation_format=layer_activation_format) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt b/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt new file mode 100644 index 000000000..ecc9cdae6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +selavi @ git+https://github.com/YingtianDt/selavi.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_SeLaVi/test.py b/brainscore_vision/models/temporal_model_SeLaVi/test.py new file mode 100644 index 000000000..1ad8c439c --- /dev/null +++ b/brainscore_vision/models/temporal_model_SeLaVi/test.py @@ -0,0 +1,18 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "SeLaVi-Kinetics400", + "SeLaVi-Kinetics-Sound", + 
"SeLaVi-VGG-Sound", + "SeLaVi-AVE" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py b/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py new file mode 100644 index 000000000..1579fc0e4 --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/__init__.py @@ -0,0 +1,14 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + +model_registry["VideoMAE-V2-B"] = lambda: commit_model("VideoMAE-V2-B") +model_registry["VideoMAE-V2-G"] = lambda: commit_model("VideoMAE-V2-G") diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/model.py b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py new file mode 100644 index 000000000..cb0f2f94c --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/model.py @@ -0,0 +1,117 @@ + +import numpy as np +import torch +from timm.models import create_model +from torchvision import transforms + +# NOTE: Do not comment `import models`, it is used to register models +from videomae_v2 import * # noqa: F401 + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +LAYER_SELECT_STEP = 2 + +def to_normalized_float_tensor(vid): + vid = torch.Tensor(vid.to_numpy()) + return vid.permute(3, 0, 1, 2).to(torch.float32) / 255 + +# NOTE: for those functions, which generally expect mini-batches, we keep them +# as non-minibatch so that they are applied as if they were 4d (thus image). 
+# this way, we only apply the transformation in the spatial domain +def resize(vid, size, interpolation='bilinear'): + # NOTE: using bilinear interpolation because we don't work on minibatches + # at this level + scale = None + if isinstance(size, int): + scale = float(size) / min(vid.shape[-2:]) + size = None + return torch.nn.functional.interpolate( + vid, + size=size, + scale_factor=scale, + mode=interpolation, + align_corners=False) + +class ToFloatTensorInZeroOne(object): + + def __call__(self, vid): + return to_normalized_float_tensor(vid) + + +class Resize(object): + + def __init__(self, size): + self.size = size + + def __call__(self, vid): + return resize(vid, self.size) + + +transform_video = transforms.Compose( + [ToFloatTensorInZeroOne(), + Resize((224, 224))]) + +def get_model(identifier): + + if identifier == "VideoMAE-V2-G": + model_name = "vit_giant_patch14_224" + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_VideoMAEv2/vit_g_hybrid_pt_1200e.pth", + version_id="TxtkfbeMV105dzpzTwi0Kn5glnvQvIrq", + # sha1="9048f2bc0b0c7ba4d0e5228f3a7c0bef4dbaca69", + sha1="32126231526fe310a6aba20c16d0e6435f5f0bb8" + ) + num_blocks = 40 + feature_map_size = 16 + elif identifier == "VideoMAE-V2-B": + model_name = "vit_base_patch16_224" + pth = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_VideoMAEv2/vit_b_hybrid_pt_800e.pth", + version_id="rRjpYq21dAQ5KaCLbEHK.YaLZ_fbMPKw", + sha1="1e3602691964b1eb6f7c33529119243a5b235635" + ) + num_blocks = 12 + feature_map_size = 14 + + num_frames = 16 + + model = create_model(model_name) + + ckpt = torch.load(pth, map_location='cpu') + for model_key in ['model', 'module']: + if model_key in ckpt: + ckpt = ckpt[model_key] + break + + encoder_ckpt = {} + for k, v in ckpt.items(): + if k.startswith("encoder."): + encoder_ckpt[k[8:]] = v + + msg = model.load_state_dict(encoder_ckpt, strict=False) + print(msg) + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "patch_embed": "THWC", + **{f"blocks.{i}": "THWC" for i in range(0, num_blocks, LAYER_SELECT_STEP)}, + # "head": "THWC" # weight not available + }, + "num_frames": num_frames, + } + + def process_activation(layer, layer_name, inputs, output): + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, feature_map_size, feature_map_size, C) + return output + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_activation, + **inferencer_kwargs) + return wrapper diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt b/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt new file mode 100644 index 000000000..5af2d95b6 --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/requirements.txt @@ -0,0 +1,4 @@ +torch +torchvision +timm +videomae_v2 @ git+https://github.com/YingtianDt/VideoMAEv2.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_VideoMAEv2/test.py b/brainscore_vision/models/temporal_model_VideoMAEv2/test.py new file mode 100644 index 000000000..905296c3a --- /dev/null +++ b/brainscore_vision/models/temporal_model_VideoMAEv2/test.py @@ -0,0 +1,16 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "VideoMAE-V2-B", + "VideoMAE-V2-G", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert 
model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/__init__.py b/brainscore_vision/models/temporal_model_mae_st/__init__.py new file mode 100644 index 000000000..6eb152a20 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/__init__.py @@ -0,0 +1,15 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["MAE-ST-L"] = lambda: commit_model("MAE-ST-L") +model_registry["MAE-ST-G"] = lambda: commit_model("MAE-ST-G") \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/model.py b/brainscore_vision/models/temporal_model_mae_st/model.py new file mode 100644 index 000000000..fc8f399b5 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/model.py @@ -0,0 +1,100 @@ +import torch +from iopath.common.file_io import g_pathmgr as pathmgr +from mae_st import models_vit +from mae_st.util import misc +from mae_st.util.pos_embed import interpolate_pos_embed + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.s3 import load_weight_file + + +LAYER_SELECT_STEP = 2 +mean = (0.45, 0.45, 0.45) +std = (0.225, 0.225, 0.225) + +from torchvision import transforms + +transform_img = transforms.Compose([ + transforms.Resize((224, 224)), + transforms.Normalize(mean, std), +]) + + +def transform_video(video): + import torch + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_img(frames) + return frames.permute(1, 0, 2, 3) + + + +def get_model(identifier): + + if identifier == "MAE-ST-L": + model_name = "vit_large_patch16" + num_blocks = 24 + feature_map_size = 14 + load_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_mae_st/mae_pretrain_vit_large_k400.pth", + version_id="cPcP4AzpG95CimQ5Pn.CHKnGUJlLXM3m", + sha1="c7fb91864a4ddf8b99309440121a3abe66b846bb" + ) + + elif identifier == "MAE-ST-G": + model_name = "vit_huge_patch14" + num_blocks = 32 + feature_map_size = 16 + load_path = load_weight_file( + bucket="brainscore-vision", + relative_path="temporal_model_mae_st/mae_pretrain_vit_huge_k400.pth", + version_id="IYKa8QiocgBzo3EhsBouS62HboK6iqYT", + sha1="177e48577142ca01949c08254834ffa1198b9eb4" + ) + + num_frames = 16 + t_patch_size = 2 + + model = models_vit.__dict__[model_name]( + num_frames=num_frames, + t_patch_size=t_patch_size + ) + + with pathmgr.open(load_path, "rb") as f: + checkpoint = torch.load(f, map_location="cpu") + + print("Load pre-trained checkpoint from: %s" % load_path) + if "model" in checkpoint.keys(): + checkpoint_model = checkpoint["model"] + else: + checkpoint_model = checkpoint["model_state"] + # interpolate position embedding + interpolate_pos_embed(model, checkpoint_model) + + checkpoint_model = misc.convert_checkpoint(checkpoint_model) + + # load pre-trained model + msg = model.load_state_dict(checkpoint_model, strict=False) + print(msg) + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "patch_embed": "THWC", + 
**{f"blocks.{i}": "THWC" for i in range(0, num_blocks, LAYER_SELECT_STEP)}, + # "head": "THWC" # weight not available + }, + "num_frames": num_frames, + } + + def process_activation(layer, layer_name, inputs, output): + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, feature_map_size, feature_map_size, C) + return output + + wrapper = PytorchWrapper(identifier, model, transform_video, + process_output=process_activation, + **inferencer_kwargs) + + return wrapper diff --git a/brainscore_vision/models/temporal_model_mae_st/requirements.txt b/brainscore_vision/models/temporal_model_mae_st/requirements.txt new file mode 100644 index 000000000..0d1858c8e --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/requirements.txt @@ -0,0 +1,3 @@ +mae_st @ git+https://github.com/YingtianDt/mae_st.git +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mae_st/test.py b/brainscore_vision/models/temporal_model_mae_st/test.py new file mode 100644 index 000000000..0f2b2cb56 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mae_st/test.py @@ -0,0 +1,16 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "MAE-ST-L", + "MAE-ST-G", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/__init__.py b/brainscore_vision/models/temporal_model_mmaction2/__init__.py new file mode 100644 index 000000000..685183ff9 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/__init__.py @@ -0,0 +1,23 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . 
import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["I3D"] = lambda: commit_model("I3D") +model_registry["I3D-nonlocal"] = lambda: commit_model("I3D-nonlocal") +model_registry["SlowFast"] = lambda: commit_model("SlowFast") +model_registry["X3D"] = lambda: commit_model("X3D") +model_registry["TimeSformer"] = lambda: commit_model("TimeSformer") +model_registry["VideoSwin-B"] = lambda: commit_model("VideoSwin-B") +model_registry["VideoSwin-L"] = lambda: commit_model("VideoSwin-L") +model_registry["UniFormer-V1"] = lambda: commit_model("UniFormer-V1") +model_registry["UniFormer-V2-B"] = lambda: commit_model("UniFormer-V2-B") +model_registry["UniFormer-V2-L"] = lambda: commit_model("UniFormer-V2-L") diff --git a/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv b/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv new file mode 100644 index 000000000..9f253b57f --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/mmaction2.csv @@ -0,0 +1,24 @@ +name,config,checkpoint,Kinetics400-top1,Kinetics400-top5,FLOPs,params,note +I3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50_8xb8-32x2x1-100e_kinetics400-rgb_20220812-e213c223.pth,73.47,91.27,43.5G,28.0M, +I3D-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth,,,,, +TSM,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50_8xb16-1x1x8-50e_kinetics400-rgb_20220831-64d69186.pth,75.12,91.55,65.75G,23.87M, +SlowFast,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/slowfast/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb/slowfast_r50_8xb8-8x8x1-256e_kinetics400-rgb_20220818-1cb6dfc8.pth,,,,, +X3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/x3d/x3d_m_16x5x1_facebook-kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/x3d/facebook/x3d_m_16x5x1_facebook-kinetics400-rgb_20201027-3f42382a.pth,,,,, +TimeSformer,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/timesformer/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb/timesformer_divST_8xb8-8x32x1-15e_kinetics400-rgb_20220815-a4d0d01f.pth,,,,, 
+VideoSwin-B,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth,,,,, +VideoSwin-L,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-large-p244-w877_in22k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-78ad8b11.pth,,,,, +UniFormer-V1,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformer/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv1/uniformer-base_imagenet1k-pre_32x4x1_kinetics400-rgb_20221219-b776322c.pth,,,,, +UniFormer-V2-B,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth,,,,, +UniFormer-V2-L,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/kinetics400/uniformerv2-large-p14-res224_clip-kinetics710-pre_u32_kinetics400-rgb_20221219-56a46f64.pth,,,,, +VideoMAE-V2,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400.py,https://download.openmmlab.com/mmaction/v1.0/recognition/videomaev2/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400/vit-base-p16_videomaev2-vit-g-dist-k710-pre_16x4x1_kinetics-400_20230510-3e7f93b2.pth,86.6,97.3,180G,87M, +VideoMAE-V1,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400.py,https://download.openmmlab.com/mmaction/v1.0/recognition/videomae/vit-base-p16_videomae-k400-pre_16x4x1_kinetics-400_20221013-860a3cd3.pth,81.3,95.0,180G,87M, +R2plus1D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/r2plus1d/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb/r2plus1d_r34_8xb8-32x2x1-180e_kinetics400-rgb_20220812-4270588c.pth,75.46,92.28,213G,63.8M, +I3D-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/i3d/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb/i3d_imagenet-pretrained-r50-nl-dot-product_8xb8-32x2x1-100e_kinetics400-rgb_20220812-8e1f2148.pth,74.80,92.07,59.3G,35.4M, 
+TSN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsn/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb/tsn_imagenet-pretrained-r50_8xb32-1x1x8-100e_kinetics400-rgb_20220906-2692d16c.pth,74.12,91.34,102.7G,24.33M, +C3D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/c3d/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb/c3d_sports1m-pretrained_8xb30-16x1x1-45e_ucf101-rgb_20220811-31723200.pth,,,38.5G,78.4M, +UniFormer-V2,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/uniformerv2/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb/uniformerv2-base-p16-res224_clip-kinetics710-pre_8xb32-u8_kinetics400-rgb_20230313-75be0806.pth,,,0.1T,115M, +VideoSwin,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/swin/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb/swin-base-p244-w877_in1k-pre_8xb8-amp-32x2x1-30e_kinetics400-rgb_20220930-182ec6cc.pth,80.57,94.49,282G,88.0M, +C2D,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/c2d/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb/c2d_r50-in1k-pre-nopool_8xb32-8x8x1-100e_kinetics400-rgb_20221027-e0227b22.pth,73.44,91.00,33G,24.3M, +TSM-nonlocal,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tsm/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb/tsm_imagenet-pretrained-r50-nl-dot-product_8xb16-1x1x8-50e_kinetics400-rgb_20220831-108bfde5.pth,74.49,91.15,61.30G,31.68M, +CSN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/csn/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb/ircsn_ig65m-pretrained-r50-bnfrozen_8xb12-32x2x1-58e_kinetics400-rgb_20220811-44395bae.pth,79.44,94.26,55.90G,13.13M, +TPN,https://github.com/open-mmlab/mmaction2/blob/main/configs/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb.py,https://download.openmmlab.com/mmaction/v1.0/recognition/tpn/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb/tpn-slowonly_r50_8xb8-8x8x1-150e_kinetics400-rgb_20220913-97d0835d.pth,74.20,91.48,,, \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/model.py b/brainscore_vision/models/temporal_model_mmaction2/model.py new file mode 100644 index 000000000..351158842 --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/model.py @@ -0,0 +1,226 @@ +import os +import numpy as np + +import mmengine +import mmaction +from mmaction.apis import init_recognizer +from mmengine.registry import init_default_scope +from mmengine.dataset import Compose, 
pseudo_collate + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper + + +LARGE_MODEL_LAYER_STEP = 2 +HOME = os.path.join(os.path.dirname(mmaction.__file__), "models") + + +class MMActionModelWrapper(PytorchWrapper): + meta = None + + def load_meta(self, path=os.path.join(os.path.dirname(__file__), "mmaction2.csv")): + if self.meta is None: + import pandas as pd + self.meta = pd.read_csv(path) + + def __init__(self, model_name, process_output=None, *args, **kwargs): + self.load_meta() + + _num_frames = None + num_frames = kwargs.get("num_frames") + if isinstance(num_frames, (list, tuple)): + if num_frames[0] == num_frames[1]: + _num_frames = num_frames + elif num_frames is not None: + _num_frames = num_frames + + model_data = self.meta[self.meta['name'] == model_name].iloc[0] # return a Series + config = model_data['config'] + checkpoint = model_data['checkpoint'] + config = config.replace("https://github.com/open-mmlab/mmaction2/blob/main/", "") + config_path = os.path.join(HOME, config) + config = mmengine.Config.fromfile(config_path) + + test_pipeline_cfg = config.test_pipeline + # SampleFrames: clip_len x frame_interval (sampling interval) x num_clips + # change every ThreeCrop and TenCrop to CenterCrop + for i, pipeline in enumerate(test_pipeline_cfg): + if pipeline['type'] in ['ThreeCrop', 'TenCrop']: + test_pipeline_cfg[i] = {'type': 'CenterCrop', 'crop_size': pipeline['crop_size']} + if pipeline['type'] in ['SampleFrames']: + test_pipeline_cfg[i].update({"num_clips": 1, 'frame_interval': 1}) + + model = init_recognizer(config, checkpoint, device="cpu") + init_default_scope(model.cfg.get('default_scope', 'mmaction')) + test_pipeline = Compose(test_pipeline_cfg[3:]) + + def transform_video(video): + imgs = video.to_numpy() + data = {'imgs': imgs, 'num_clips': 1, 'modality': 'RGB'} + if _num_frames is not None: + data['clip_len'] = _num_frames + assert len(imgs) == _num_frames + else: + data['clip_len'] = len(imgs) + + data = test_pipeline(data) + return data + + super().__init__(model_name, model, transform_video, process_output, *args, **kwargs) + + def forward(self, inputs): + data = pseudo_collate(inputs) + data["inputs"] = [d.to(self._device) for d in data["inputs"]] + result = self._model.test_step(data)[0] + return result + + +def get_model(identifier): + if identifier == "I3D": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.conv1": "CTHW", # too large: (C: 64, T: *, H: 128, W: 128) + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + "num_frames": (5, np.inf), + } + + if identifier == "I3D-nonlocal": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.conv1": "CTHW", # too large: (C: 64, T: *, H: 128, W: 128) + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + "num_frames": (5, np.inf), + } + + if identifier == "TSM": + process_output = None + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": {}, + } + + if identifier == "SlowFast": + process_output = None + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.slow_path.conv1_lateral": "CTHW", + **{f"backbone.slow_path.layer{i}_lateral": "CTHW" for i in range(1, 4)}, + "cls_head": "C", + }, + "num_frames": 32, # TODO: in fact can be multiple of 4? 
+ } + + if identifier == "X3D": + process_output = None + inferencer_kwargs = { + "fps": 30, + "layer_activation_format": { + "backbone.conv1_t": "CTHW", + **{f"backbone.layer{i}": "CTHW" for i in range(1, 5)}, + "cls_head": "C", + }, + } + + if identifier == "TimeSformer": + inferencer_kwargs = { + "fps": 8, + "layer_activation_format": { + "backbone.patch_embed": "CTHW", + **{f"backbone.transformer_layers.layers.{i}": "HWTC" for i in range(0, 12, LARGE_MODEL_LAYER_STEP)}, + "cls_head": "C", + }, + "num_frames": 8 + } + def process_output(layer, layer_name, inputs, output): + if layer_name == "backbone.patch_embed": + B = inputs[0].shape[0] + C = output.shape[-1] + output = output.reshape(B, -1, 14, 14, C) + if layer_name.startswith("backbone.transformer_layers.layers."): + output = output[:, 1:] + B = output.shape[0] + C = output.shape[-1] + output = output.reshape(B, 14, 14, -1, C) + return output + + if identifier in ["VideoSwin-B", "VideoSwin-L"]: + + transformer_layers = { + **{f"backbone.layers.0.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.1.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.2.blocks.{i}": "THWC" for i in range(0, 18, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.layers.3.blocks.{i}": "THWC" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 12.5, + "layer_activation_format": { + "backbone.patch_embed": "CTHW", + **transformer_layers, + "cls_head": "C", + }, + } + process_output = None + + if identifier == "UniFormer-V1": + + transformer_layers = { + **{f"backbone.blocks1.{i}": "CTHW" for i in range(0, 5, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks2.{i}": "CTHW" for i in range(0, 8, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks3.{i}": "CTHW" for i in range(0, 20, LARGE_MODEL_LAYER_STEP)}, + **{f"backbone.blocks4.{i}": "CTHW" for i in range(0, 7, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 6.25, + "layer_activation_format": { + "backbone.pos_drop": "CTHW", + **transformer_layers, + "cls_head": "C", + }, + } + process_output = None + + if identifier.startswith("UniFormer-V2"): + + if identifier == "UniFormer-V2-B": + num_frames = 8 + num_transformer_layers = 12 + img_size = 14 + elif identifier == "UniFormer-V2-L": + num_frames = 32 + num_transformer_layers = 24 + img_size = 16 + + transformer_layers = { + **{f"backbone.transformer.resblocks.{i}": "HWTC" for i in range(0, num_transformer_layers, LARGE_MODEL_LAYER_STEP)}, + } + + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": { + "backbone.conv1": "CTHW", + **transformer_layers, + "backbone": "C", + "cls_head": "C", + }, + "num_frames": num_frames + } + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("backbone.transformer.resblocks."): + T = inputs[1] + C = output.shape[-1] + output = output[1:] # remove the class token + output = output.reshape(img_size, img_size, -1, T, C).permute(2, 0, 1, 3, 4) # BHWTC + return output + + model = MMActionModelWrapper(identifier, process_output, **inferencer_kwargs) + return model \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/requirements.txt b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt new file mode 100644 index 000000000..1caa728dd --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/requirements.txt @@ -0,0 +1,5 @@ +importlib-metadata<5 +mmaction2 @ git+https://github.com/YingtianDt/mmaction2.git@533edc3 
+mmengine +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_mmaction2/test.py b/brainscore_vision/models/temporal_model_mmaction2/test.py new file mode 100644 index 000000000..d539c623a --- /dev/null +++ b/brainscore_vision/models/temporal_model_mmaction2/test.py @@ -0,0 +1,24 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "I3D", + "I3D-nonlocal", + "SlowFast", + "X3D", + "TimeSformer", + "VideoSwin-B", + "VideoSwin-L", + "UniFormer-V1", + "UniFormer-V2-B", + "UniFormer-V2-L", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/__init__.py b/brainscore_vision/models/temporal_model_openstl/__init__.py new file mode 100644 index 000000000..2b49cc845 --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/__init__.py @@ -0,0 +1,19 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry["ConvLSTM"] = lambda: commit_model("ConvLSTM") +model_registry["PredRNN"] = lambda: commit_model("PredRNN") +# model_registry["PredNet"] = lambda: commit_model("PredNet") +model_registry["SimVP"] = lambda: commit_model("SimVP") +model_registry["TAU"] = lambda: commit_model("TAU") +model_registry["MIM"] = lambda: commit_model("MIM") diff --git a/brainscore_vision/models/temporal_model_openstl/model.py b/brainscore_vision/models/temporal_model_openstl/model.py new file mode 100644 index 000000000..aed3e0464 --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/model.py @@ -0,0 +1,223 @@ +import os +import imp +import numpy as np +from collections import OrderedDict + +import torch +from torchvision import transforms +import openstl +from openstl.methods import method_maps +from openstl.utils import reshape_patch + +from brainscore_vision.model_helpers.activations.temporal.model import PytorchWrapper +from brainscore_vision.model_helpers.activations.temporal.utils import download_weight_file + + +# We only use models trained on KITTI dataset, because it is the most ecological, +# diverse, challenging, and widely used dataset for next frame prediction among +# the datasets used by OpenSTL repo. 
+IMAGE_SIZES = (128, 160) # for KITTI +KITTI_CONFIG_DIR = os.path.join(os.path.dirname(openstl.__file__), "configs/kitticaltech") +KITTI_FPS = 10 # from paper: https://www.cvlibs.net/publications/Geiger2012CVPR.pdf +LARGE_MODEL_LAYER_STEP = 2 + +transform_image = transforms.Resize(IMAGE_SIZES) + + +class LSTMWrapper(PytorchWrapper): + def _register_hook(self, layer, layer_name, target_dict): + def hook_function(_layer, _input, output, name=layer_name, target_dict=target_dict): + output = self._process_activation(_layer, name, _input, output) + target_dict.setdefault(name, []).append(PytorchWrapper._tensor_to_numpy(output)) + + hook = layer.register_forward_hook(hook_function) + return hook + + def get_activations(self, inputs, layer_names): + self._model.eval() + layer_results = OrderedDict() + hooks = [] + + for layer_name in layer_names: + layer = self.get_layer(layer_name) + hook = self._register_hook(layer, layer_name, target_dict=layer_results) + hooks.append(hook) + + with torch.no_grad(): + self.forward(inputs) + + for hook in hooks: + hook.remove() + + # stack the T dim to be the second dim + for layer_name, activations in layer_results.items(): + layer_results[layer_name] = np.stack(activations, axis=1) + + return layer_results + + def forward(self, inputs): + tensor = torch.stack(inputs) + tensor = tensor.to(self._device) + return self._model(tensor, return_loss=False) + + +class MIMWrapper(LSTMWrapper): + def forward(self, inputs): + output = super().forward(inputs) + # clear MIMBlock.convlstm_c + def _clear_helper(module): + if hasattr(module, "convlstm_c"): + module.convlstm_c = None + for child in module.children(): + _clear_helper(child) + _clear_helper(self._model) + return output + + +def _get_config(name, parent_dir): + config = imp.load_source(name, os.path.join(parent_dir, f"{name}.py")).__dict__ + config = {k: v for k, v in config.items() if not k.startswith("__")} + return config + + +def get_model(identifier): + config = _get_config(identifier, KITTI_CONFIG_DIR) + config["method"] = config["method"].lower() + config['dataname'] = "kitticaltech" + config['dataname'] = "kitticaltech" + config['metrics'] = ['mse', 'mae'] # not in use, just to initialize the model + config['in_shape'] = [None, 3, *IMAGE_SIZES] + + if identifier == "PredRNN": + layer_activation_format = { + **{f"cell_list.{i}": "TCHW" for i in range(4)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c, m = output + return m + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_predrnn_one_ep100.pth" + + elif identifier == "PredNet": + layer_activation_format = { + **{f"layer{i}": "TCHW" for i in range(4)}, + "layer5": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c = output + return c + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_prednet_one_ep100.pth" + + elif identifier == "ConvLSTM": + layer_activation_format = { + **{f"cell_list.{i}": "TCHW" for i in range(4)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("cell_list"): + h, c = output + return c + else: + return output + + wrapper_cls = LSTMWrapper + kwargs = {} + weight_name = "kitticaltech_convlstm_one_ep100.pth" + + elif identifier in ["SimVP", "TAU"]: + num_frames = 10 + layer_activation_format = { + **{f"enc.enc.{i}": "TCHW" for i in 
range(0, 2, LARGE_MODEL_LAYER_STEP)}, + **{f"hid.enc.{i}": "TCHW" for i in range(0, 6, LARGE_MODEL_LAYER_STEP)}, + **{f"dec.dec.{i}": "TCHW" for i in range(0, 2, LARGE_MODEL_LAYER_STEP)}, + } + + config['in_shape'] = [num_frames, 3, *IMAGE_SIZES] + wrapper_cls = PytorchWrapper + kwargs = {"num_frames": num_frames} + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("enc") or layer_name.startswith("dec"): + output = output.view(-1, num_frames, *output.shape[1:]) + elif layer_name.startswith("hid"): + output = output[:, None] # time-compressed layers + return output + if identifier == "SimVP": + weight_name = "kitticaltech_simvp_gsta_one_ep100.pth" + elif identifier == "TAU": + weight_name = "kitticaltech_tau_one_ep100.pth" + + elif identifier == "MIM": + layer_activation_format = { + **{f"stlstm_layer.{i}": "TCHW" for i in range(0, 4, LARGE_MODEL_LAYER_STEP)}, + **{f"stlstm_layer_diff.{i}": "TCHW" for i in range(0, 3, LARGE_MODEL_LAYER_STEP)}, + "conv_last": "TCHW" + } + + def process_output(layer, layer_name, inputs, output): + if layer_name.startswith("stlstm_layer."): + h, c, m = output + ret = m + elif layer_name.startswith("stlstm_layer_diff."): + h, c = output + ret = c + else: + ret = output + return ret + + wrapper_cls = MIMWrapper + kwargs = {} + weight_name = "kitticaltech_mim_one_ep100.pth" + + + model = method_maps[config["method"]](**config).model + url = f"https://github.com/chengtan9907/OpenSTL/releases/download/kitti-weights/{weight_name}" + weight_path = download_weight_file(url, folder="temporal_model_openstl") + model.load_state_dict(torch.load(weight_path, map_location="cpu")) + + def transform_video_lstm(video): + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_image(frames) + frames = frames.permute(0, 2, 3, 1)[None, :] # BTHWC + patch_size = config["patch_size"] + assert 5 == frames.ndim + batch_size, seq_length, img_height, img_width, num_channels = frames.shape + a = frames.reshape(batch_size, seq_length, + img_height//patch_size, patch_size, + img_width//patch_size, patch_size, + num_channels) + b = a.transpose(3, 4) + patches = b.reshape(batch_size, seq_length, + img_height//patch_size, + img_width//patch_size, + patch_size*patch_size*num_channels)[0] + return patches + + def transform_video_simvp(video): + frames = torch.Tensor(video.to_numpy() / 255.0).permute(0, 3, 1, 2) + frames = transform_image(frames) + return frames + + if identifier in ("PredRNN", "ConvLSTM", "MIM"): + transform_video = transform_video_lstm + else: + transform_video = transform_video_simvp + + return wrapper_cls(identifier, model, transform_video, fps=KITTI_FPS, + layer_activation_format=layer_activation_format, + process_output=process_output, **kwargs) \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/requirements.txt b/brainscore_vision/models/temporal_model_openstl/requirements.txt new file mode 100644 index 000000000..4d839c16f --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/requirements.txt @@ -0,0 +1,3 @@ +torch +torchvision +openstl @ git+https://github.com/YingtianDt/OpenSTL.git \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_openstl/test.py b/brainscore_vision/models/temporal_model_openstl/test.py new file mode 100644 index 000000000..4d52b76ce --- /dev/null +++ b/brainscore_vision/models/temporal_model_openstl/test.py @@ -0,0 +1,20 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + 
"ConvLSTM", + "PredRNN", + "PredNet", + "SimVP", + "TAU", + "MIM" +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/__init__.py b/brainscore_vision/models/temporal_model_torchvision/__init__.py new file mode 100644 index 000000000..54820f87f --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/__init__.py @@ -0,0 +1,19 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from brainscore_vision.model_helpers.activations.temporal.utils import get_specified_layers +from brainscore_vision.model_interface import BrainModel +from . import model + + +def commit_model(identifier): + activations_model=model.get_model(identifier) + layers=get_specified_layers(activations_model) + return ModelCommitment(identifier=identifier, activations_model=activations_model, layers=layers) + + +model_registry['r3d_18'] = lambda: commit_model('r3d_18') +model_registry['r2plus1d_18'] = lambda: commit_model('r2plus1d_18') +model_registry['mc3_18'] = lambda: commit_model('mc3_18') +model_registry['s3d'] = lambda: commit_model('s3d') +model_registry['mvit_v1_b'] = lambda: commit_model('mvit_v1_b') +model_registry['mvit_v2_s'] = lambda: commit_model('mvit_v2_s') \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/model.py b/brainscore_vision/models/temporal_model_torchvision/model.py new file mode 100644 index 000000000..30d96aba8 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/model.py @@ -0,0 +1,92 @@ +import torch +import numpy as np +from torchvision import transforms +from torchvision.models import video as vid + +from brainscore_vision.model_helpers.activations.temporal.model.pytorch import PytorchWrapper + + +LARGE_MODEL_LAYER_STEP = 2 + +def get_transform_video(transform_img): + def transform_video(video): + frames = video.to_numpy() / 255. 
+ frames = torch.Tensor(frames) + frames = frames.permute(0, 3, 1, 2) + frames = transform_img(frames) + return frames.permute(1, 0, 2, 3) + return transform_video + + +def get_model(identifier): + if identifier in ["r3d_18", "r2plus1d_18", "mc3_18"]: + img_transform = transforms.Compose([ + transforms.Resize((128, 171)), + transforms.CenterCrop(112), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 25, + "layer_activation_format": + { + "stem": "CTHW", + **{f'layer{i}': "CTHW" for i in range(1, 5)}, + "avgpool": "CTHW", + "fc": "C" + }, + } + process_output = None + + elif identifier == "s3d": + img_transform = transforms.Compose([ + transforms.Resize((256, 256)), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 15, + "num_frames": (13, np.inf), + "layer_activation_format": + { + **{f"features.{i}": "CTHW" for i in range(0, 16, LARGE_MODEL_LAYER_STEP)}, + "avgpool": "CTHW", + "classifier": "CTHW" + } + } + process_output = None + + elif identifier in ["mvit_v1_b", "mvit_v2_s"]: + img_transform = transforms.Compose([ + transforms.Resize((256, 256)), + transforms.CenterCrop(224), + transforms.Normalize(mean=[0.43216, 0.394666, 0.37645], std=[0.22803, 0.22145, 0.216989]) + ]) + inferencer_kwargs = { + "fps": 7.5, + "num_frames": 16, + "layer_activation_format": { + "conv_proj": "CTHW", + **{f"blocks.{i}": "THWC" for i in range(0, 16, LARGE_MODEL_LAYER_STEP)}, + "head": "C", + } + } + + def process_output(layer, layer_name, input, output): + if layer_name.startswith("blocks"): + output, thw = output + t, h, w = thw + output = output[:, 1:] # remove cls + b, n, c = output.shape + assert n == t*h*w + output = output.view(b, t, h, w, c) + return output + return output + + vid_transform = get_transform_video(img_transform) + model_name = identifier + model = getattr(vid, model_name)(weights="KINETICS400_V1") + wrapper = PytorchWrapper(identifier, model, vid_transform, + process_output=process_output, + **inferencer_kwargs) + + return wrapper \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/requirements.txt b/brainscore_vision/models/temporal_model_torchvision/requirements.txt new file mode 100644 index 000000000..37f700a78 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/requirements.txt @@ -0,0 +1,2 @@ +torch +torchvision \ No newline at end of file diff --git a/brainscore_vision/models/temporal_model_torchvision/test.py b/brainscore_vision/models/temporal_model_torchvision/test.py new file mode 100644 index 000000000..77486ad03 --- /dev/null +++ b/brainscore_vision/models/temporal_model_torchvision/test.py @@ -0,0 +1,20 @@ +import pytest + +from brainscore_vision import load_model + + +model_list = [ + "r3d_18", + "r2plus1d_18", + "mc3_18", + "s3d", + "mvit_v1_b", + "mvit_v2_s", +] + +@pytest.mark.private_access +@pytest.mark.memory_intense +@pytest.mark.parametrize("model_identifier", model_list) +def test_load(model_identifier): + model = load_model(model_identifier) + assert model is not None \ No newline at end of file
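
Note for reviewers: every plugin above registers its identifiers through the same commit_model() pattern, so loading any of them follows the path exercised by the per-plugin test.py files. A minimal sketch, assuming brainscore_vision and the given plugin's requirements.txt are installed and the private-access weights are reachable (nothing below is part of the diff itself):

    from brainscore_vision import load_model

    # Each registered identifier resolves to commit_model(identifier), which builds the
    # activations model, reads its specified layers via get_specified_layers, and wraps
    # both in a ModelCommitment.
    model = load_model("GDT-Kinetics400")  # any identifier registered above works the same way
    assert model is not None               # mirrors the assertion in each plugin's test.py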