diff --git a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/__init__.py
index 670fc96c5..5e228e5f8 100644
--- a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/__init__.py
+++ b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/__init__.py
@@ -3,7 +3,7 @@
 from .model import get_model, get_layers
 
 
-model_registry['cvt_cvt-13-224-in1k_4'] = \
-    lambda: ModelCommitment(identifier='cvt_cvt-13-224-in1k_4',
-                            activations_model=get_model('cvt_cvt-13-224-in1k_4'),
-                            layers=get_layers('cvt_cvt-13-224-in1k_4'))
\ No newline at end of file
+model_registry['cvt_cvt-13-224-in1k_4_LucyV4'] = \
+    lambda: ModelCommitment(identifier='cvt_cvt-13-224-in1k_4_LucyV4',
+                            activations_model=get_model('cvt_cvt-13-224-in1k_4_LucyV4'),
+                            layers=get_layers('cvt_cvt-13-224-in1k_4_LucyV4'))
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/model.py
index d6d819a90..807bb6700 100644
--- a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/model.py
+++ b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/model.py
@@ -12,7 +12,7 @@
 
 
 def get_model(name):
-    assert name == 'cvt_cvt-13-224-in1k_4'
+    assert name == 'cvt_cvt-13-224-in1k_4_LucyV4'
     # https://huggingface.co/models?sort=downloads&search=cvt
     image_size = 224
     processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-13')
@@ -25,7 +25,7 @@ def get_model(name):
 
 
 def get_layers(name):
-    assert name == 'cvt_cvt-13-224-in1k_4'
+    assert name == 'cvt_cvt-13-224-in1k_4_LucyV4'
     layers = []
     layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)]
     layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(2)]
@@ -38,7 +38,15 @@ def get_bibtex(model_identifier):
     """
     A method returning the bibtex reference of the requested model as a string.
""" - return '' + return """@misc{wu2021cvtintroducingconvolutionsvision, + title={CvT: Introducing Convolutions to Vision Transformers}, + author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang}, + year={2021}, + eprint={2103.15808}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2103.15808}, + }""" def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs): diff --git a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/test.py index 6474d8167..e8bd4d224 100644 --- a/brainscore_vision/models/cvt_cvt_13_224_in1k_4/test.py +++ b/brainscore_vision/models/cvt_cvt_13_224_in1k_4/test.py @@ -4,5 +4,5 @@ @pytest.mark.travis_slow def test_has_identifier(): - model = brainscore_vision.load_model('cvt_cvt-13-224-in1k_4') - assert model.identifier == 'cvt_cvt-13-224-in1k_4' \ No newline at end of file + model = brainscore_vision.load_model('cvt_cvt-13-224-in1k_4_LucyV4') + assert model.identifier == 'cvt_cvt-13-224-in1k_4_LucyV4' \ No newline at end of file diff --git a/brainscore_vision/models/cvt_cvt_13_384_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/__init__.py new file mode 100644 index 000000000..946a5c2dd --- /dev/null +++ b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/__init__.py @@ -0,0 +1,9 @@ +from brainscore_vision import model_registry +from brainscore_vision.model_helpers.brain_transformation import ModelCommitment +from .model import get_model, get_layers + + +model_registry['cvt_cvt-13-384-in1k_4_LucyV4'] = \ + lambda: ModelCommitment(identifier='cvt_cvt-13-384-in1k_4_LucyV4', + activations_model=get_model('cvt_cvt-13-384-in1k_4_LucyV4'), + layers=get_layers('cvt_cvt-13-384-in1k_4_LucyV4')) \ No newline at end of file diff --git a/brainscore_vision/models/cvt_cvt_13_384_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/model.py new file mode 100644 index 000000000..7a3a5ae18 --- /dev/null +++ b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/model.py @@ -0,0 +1,142 @@ +from brainscore_vision.model_helpers.check_submission import check_models +import functools +from transformers import AutoFeatureExtractor, CvtForImageClassification +from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper +from PIL import Image +import numpy as np +import torch + +""" +Template module for a base model submission to brain-score +""" + + +def get_model(name): + assert name == 'cvt_cvt-13-384-in1k_4_LucyV4' + # https://huggingface.co/models?sort=downloads&search=cvt + image_size = 384 + processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-13-384-1k') + model = CvtForImageClassification.from_pretrained('microsoft/cvt-13-384-1k') + preprocessing = functools.partial(load_preprocess_images, processor=processor, image_size=image_size) + wrapper = PytorchWrapper(identifier=name, model=model, preprocessing=preprocessing) + wrapper.image_size = image_size + + return wrapper + + +def get_layers(name): + assert name == 'cvt_cvt-13-384-in1k_4_LucyV4' + layers = [] + layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)] + layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(2)] + layers += [f'cvt.encoder.stages.2.layers.{i}' for i in range(10)] + layers += ['layernorm'] + return layers + + +def get_bibtex(model_identifier): + """ + A method returning the bibtex reference of the requested model as a string. 
+ """ + return """@misc{wu2021cvtintroducingconvolutionsvision, + title={CvT: Introducing Convolutions to Vision Transformers}, + author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang}, + year={2021}, + eprint={2103.15808}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2103.15808}, + }""" + + +def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs): + images = load_images(image_filepaths) + # images = [, ...] + images = [image.resize((image_size, image_size)) for image in images] + if processor is not None: + images = [processor(images=image, return_tensors="pt", **kwargs) for image in images] + if len(images[0].keys()) != 1: + raise NotImplementedError(f'unknown processor for getting model {processor}') + assert list(images[0].keys())[0] == 'pixel_values' + images = [image['pixel_values'] for image in images] + images = torch.cat(images) + images = images.cpu().numpy() + else: + images = preprocess_images(images, image_size=image_size, **kwargs) + return images + + +def load_images(image_filepaths): + return [load_image(image_filepath) for image_filepath in image_filepaths] + + +def load_image(image_filepath): + with Image.open(image_filepath) as pil_image: + if 'L' not in pil_image.mode.upper() and 'A' not in pil_image.mode.upper() \ + and 'P' not in pil_image.mode.upper(): # not binary and not alpha and not palletized + # work around to https://github.com/python-pillow/Pillow/issues/1144, + # see https://stackoverflow.com/a/30376272/2225200 + return pil_image.copy() + else: # make sure potential binary images are in RGB + rgb_image = Image.new("RGB", pil_image.size) + rgb_image.paste(pil_image) + return rgb_image + + +def preprocess_images(images, image_size, **kwargs): + preprocess = torchvision_preprocess_input(image_size, **kwargs) + images = [preprocess(image) for image in images] + images = np.concatenate(images) + return images + + +def torchvision_preprocess_input(image_size, **kwargs): + from torchvision import transforms + return transforms.Compose([ + transforms.Resize((image_size, image_size)), + torchvision_preprocess(**kwargs), + ]) + + +def torchvision_preprocess(normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)): + from torchvision import transforms + return transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=normalize_mean, std=normalize_std), + lambda img: img.unsqueeze(0) + ]) + + +def create_static_video(image, num_frames, normalize_0to1=False, channel_dim=3): + ''' + Create a static video with the same image in all frames. + Args: + image (PIL.Image.Image): Input image. + num_frames (int): Number of frames in the video. + Returns: + result (np.ndarray): np array of frames of shape (num_frames, height, width, 3). + ''' + frames = [] + for _ in range(num_frames): + frame = np.array(image) + if normalize_0to1: + frame = frame / 255. + if channel_dim == 1: + frame = frame.transpose(2, 0, 1) + frames.append(frame) + return np.stack(frames) + + +if __name__ == '__main__': + # Use this method to ensure the correctness of the BaseModel implementations. + # It executes a mock run of brain-score benchmarks. 
+    check_models.check_base_models(__name__)
+
+"""
+Notes on the error:
+
+- 'channel_x' key error:
+# 'embeddings.patch_embeddings.projection',
+https://github.com/search?q=repo%3Abrain-score%2Fmodel-tools%20channel_x&type=code
+
+"""
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in1k_4/requirements.txt b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/requirements.txt
new file mode 100644
index 000000000..b71f00d65
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+transformers==4.30.2
+pillow
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/test.py
new file mode 100644
index 000000000..96ab93c14
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in1k_4/test.py
@@ -0,0 +1,8 @@
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.travis_slow
+def test_has_identifier():
+    model = brainscore_vision.load_model('cvt_cvt-13-384-in1k_4_LucyV4')
+    assert model.identifier == 'cvt_cvt-13-384-in1k_4_LucyV4'
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/__init__.py
new file mode 100644
index 000000000..4c3b7529a
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/__init__.py
@@ -0,0 +1,9 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from .model import get_model, get_layers
+
+
+model_registry['cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'] = \
+    lambda: ModelCommitment(identifier='cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4',
+                            activations_model=get_model('cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'),
+                            layers=get_layers('cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'))
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/model.py
new file mode 100644
index 000000000..b397d0e97
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/model.py
@@ -0,0 +1,142 @@
+from brainscore_vision.model_helpers.check_submission import check_models
+import functools
+from transformers import AutoFeatureExtractor, CvtForImageClassification
+from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper
+from PIL import Image
+import numpy as np
+import torch
+
+"""
+Template module for a base model submission to brain-score
+"""
+
+
+def get_model(name):
+    assert name == 'cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'
+    # https://huggingface.co/models?sort=downloads&search=cvt
+    image_size = 384
+    processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-13-384-22k')
+    model = CvtForImageClassification.from_pretrained('microsoft/cvt-13-384-22k')
+    preprocessing = functools.partial(load_preprocess_images, processor=processor, image_size=image_size)
+    wrapper = PytorchWrapper(identifier=name, model=model, preprocessing=preprocessing)
+    wrapper.image_size = image_size
+
+    return wrapper
+
+
+def get_layers(name):
+    assert name == 'cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'
+    layers = []
+    layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)]
+    layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(2)]
+    layers += [f'cvt.encoder.stages.2.layers.{i}' for i in range(10)]
+    layers += ['layernorm']
+    return layers
+
+
+def get_bibtex(model_identifier):
+    """
+    A method returning the bibtex reference of the requested model as a string.
+    """
+    return """@misc{wu2021cvtintroducingconvolutionsvision,
+        title={CvT: Introducing Convolutions to Vision Transformers},
+        author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang},
+        year={2021},
+        eprint={2103.15808},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2103.15808},
+  }"""
+
+
+def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs):
+    images = load_images(image_filepaths)
+    # images = [, ...]
+    images = [image.resize((image_size, image_size)) for image in images]
+    if processor is not None:
+        images = [processor(images=image, return_tensors="pt", **kwargs) for image in images]
+        if len(images[0].keys()) != 1:
+            raise NotImplementedError(f'unknown processor for getting model {processor}')
+        assert list(images[0].keys())[0] == 'pixel_values'
+        images = [image['pixel_values'] for image in images]
+        images = torch.cat(images)
+        images = images.cpu().numpy()
+    else:
+        images = preprocess_images(images, image_size=image_size, **kwargs)
+    return images
+
+
+def load_images(image_filepaths):
+    return [load_image(image_filepath) for image_filepath in image_filepaths]
+
+
+def load_image(image_filepath):
+    with Image.open(image_filepath) as pil_image:
+        if 'L' not in pil_image.mode.upper() and 'A' not in pil_image.mode.upper() \
+                and 'P' not in pil_image.mode.upper():  # not binary and not alpha and not palettized
+            # workaround for https://github.com/python-pillow/Pillow/issues/1144,
+            # see https://stackoverflow.com/a/30376272/2225200
+            return pil_image.copy()
+        else:  # make sure potential binary images are in RGB
+            rgb_image = Image.new("RGB", pil_image.size)
+            rgb_image.paste(pil_image)
+            return rgb_image
+
+
+def preprocess_images(images, image_size, **kwargs):
+    preprocess = torchvision_preprocess_input(image_size, **kwargs)
+    images = [preprocess(image) for image in images]
+    images = np.concatenate(images)
+    return images
+
+
+def torchvision_preprocess_input(image_size, **kwargs):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.Resize((image_size, image_size)),
+        torchvision_preprocess(**kwargs),
+    ])
+
+
+def torchvision_preprocess(normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=normalize_mean, std=normalize_std),
+        lambda img: img.unsqueeze(0)
+    ])
+
+
+def create_static_video(image, num_frames, normalize_0to1=False, channel_dim=3):
+    '''
+    Create a static video with the same image in all frames.
+    Args:
+        image (PIL.Image.Image): Input image.
+        num_frames (int): Number of frames in the video.
+    Returns:
+        result (np.ndarray): np array of frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    for _ in range(num_frames):
+        frame = np.array(image)
+        if normalize_0to1:
+            frame = frame / 255.
+        if channel_dim == 1:
+            frame = frame.transpose(2, 0, 1)
+        frames.append(frame)
+    return np.stack(frames)
+
+
+if __name__ == '__main__':
+    # Use this method to ensure the correctness of the BaseModel implementations.
+    # It executes a mock run of brain-score benchmarks.
+    check_models.check_base_models(__name__)
+
+"""
+Notes on the error:
+
+- 'channel_x' key error:
+# 'embeddings.patch_embeddings.projection',
+https://github.com/search?q=repo%3Abrain-score%2Fmodel-tools%20channel_x&type=code
+
+"""
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/requirements.txt b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/requirements.txt
new file mode 100644
index 000000000..b71f00d65
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+transformers==4.30.2
+pillow
diff --git a/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/test.py
new file mode 100644
index 000000000..8efcb1fa8
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_13_384_in22k_finetuned_in1k_4/test.py
@@ -0,0 +1,8 @@
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.travis_slow
+def test_has_identifier():
+    model = brainscore_vision.load_model('cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4')
+    assert model.identifier == 'cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4'
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_21_224_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/__init__.py
new file mode 100644
index 000000000..928c408f4
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/__init__.py
@@ -0,0 +1,9 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from .model import get_model, get_layers
+
+
+model_registry['cvt_cvt-21-224-in1k_4_LucyV4'] = \
+    lambda: ModelCommitment(identifier='cvt_cvt-21-224-in1k_4_LucyV4',
+                            activations_model=get_model('cvt_cvt-21-224-in1k_4_LucyV4'),
+                            layers=get_layers('cvt_cvt-21-224-in1k_4_LucyV4'))
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_21_224_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/model.py
new file mode 100644
index 000000000..88597fb1c
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/model.py
@@ -0,0 +1,142 @@
+from brainscore_vision.model_helpers.check_submission import check_models
+import functools
+from transformers import AutoFeatureExtractor, CvtForImageClassification
+from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper
+from PIL import Image
+import numpy as np
+import torch
+
+"""
+Template module for a base model submission to brain-score
+"""
+
+
+def get_model(name):
+    assert name == 'cvt_cvt-21-224-in1k_4_LucyV4'
+    # https://huggingface.co/models?sort=downloads&search=cvt
+    image_size = 224
+    processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-21')
+    model = CvtForImageClassification.from_pretrained('microsoft/cvt-21')
+    preprocessing = functools.partial(load_preprocess_images, processor=processor, image_size=image_size)
+    wrapper = PytorchWrapper(identifier=name, model=model, preprocessing=preprocessing)
+    wrapper.image_size = image_size
+
+    return wrapper
+
+
+def get_layers(name):
+    assert name == 'cvt_cvt-21-224-in1k_4_LucyV4'
+    layers = []
+    layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)]
+    layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(4)]
+    layers += [f'cvt.encoder.stages.2.layers.{i}' for i in range(16)]
+    layers += ['layernorm']
+    return layers
+
+
+def get_bibtex(model_identifier):
+    """
+    A method returning the bibtex reference of the requested model as a string.
+    """
+    return """@misc{wu2021cvtintroducingconvolutionsvision,
+        title={CvT: Introducing Convolutions to Vision Transformers},
+        author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang},
+        year={2021},
+        eprint={2103.15808},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2103.15808},
+  }"""
+
+
+def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs):
+    images = load_images(image_filepaths)
+    # images = [, ...]
+    images = [image.resize((image_size, image_size)) for image in images]
+    if processor is not None:
+        images = [processor(images=image, return_tensors="pt", **kwargs) for image in images]
+        if len(images[0].keys()) != 1:
+            raise NotImplementedError(f'unknown processor for getting model {processor}')
+        assert list(images[0].keys())[0] == 'pixel_values'
+        images = [image['pixel_values'] for image in images]
+        images = torch.cat(images)
+        images = images.cpu().numpy()
+    else:
+        images = preprocess_images(images, image_size=image_size, **kwargs)
+    return images
+
+
+def load_images(image_filepaths):
+    return [load_image(image_filepath) for image_filepath in image_filepaths]
+
+
+def load_image(image_filepath):
+    with Image.open(image_filepath) as pil_image:
+        if 'L' not in pil_image.mode.upper() and 'A' not in pil_image.mode.upper() \
+                and 'P' not in pil_image.mode.upper():  # not binary and not alpha and not palettized
+            # workaround for https://github.com/python-pillow/Pillow/issues/1144,
+            # see https://stackoverflow.com/a/30376272/2225200
+            return pil_image.copy()
+        else:  # make sure potential binary images are in RGB
+            rgb_image = Image.new("RGB", pil_image.size)
+            rgb_image.paste(pil_image)
+            return rgb_image
+
+
+def preprocess_images(images, image_size, **kwargs):
+    preprocess = torchvision_preprocess_input(image_size, **kwargs)
+    images = [preprocess(image) for image in images]
+    images = np.concatenate(images)
+    return images
+
+
+def torchvision_preprocess_input(image_size, **kwargs):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.Resize((image_size, image_size)),
+        torchvision_preprocess(**kwargs),
+    ])
+
+
+def torchvision_preprocess(normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=normalize_mean, std=normalize_std),
+        lambda img: img.unsqueeze(0)
+    ])
+
+
+def create_static_video(image, num_frames, normalize_0to1=False, channel_dim=3):
+    '''
+    Create a static video with the same image in all frames.
+    Args:
+        image (PIL.Image.Image): Input image.
+        num_frames (int): Number of frames in the video.
+    Returns:
+        result (np.ndarray): np array of frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    for _ in range(num_frames):
+        frame = np.array(image)
+        if normalize_0to1:
+            frame = frame / 255.
+        if channel_dim == 1:
+            frame = frame.transpose(2, 0, 1)
+        frames.append(frame)
+    return np.stack(frames)
+
+
+if __name__ == '__main__':
+    # Use this method to ensure the correctness of the BaseModel implementations.
+    # It executes a mock run of brain-score benchmarks.
+    check_models.check_base_models(__name__)
+
+"""
+Notes on the error:
+
+- 'channel_x' key error:
+# 'embeddings.patch_embeddings.projection',
+https://github.com/search?q=repo%3Abrain-score%2Fmodel-tools%20channel_x&type=code
+
+"""
diff --git a/brainscore_vision/models/cvt_cvt_21_224_in1k_4/requirements.txt b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/requirements.txt
new file mode 100644
index 000000000..b71f00d65
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+transformers==4.30.2
+pillow
diff --git a/brainscore_vision/models/cvt_cvt_21_224_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/test.py
new file mode 100644
index 000000000..1bd6f70d1
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_224_in1k_4/test.py
@@ -0,0 +1,8 @@
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.travis_slow
+def test_has_identifier():
+    model = brainscore_vision.load_model('cvt_cvt-21-224-in1k_4_LucyV4')
+    assert model.identifier == 'cvt_cvt-21-224-in1k_4_LucyV4'
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/__init__.py
new file mode 100644
index 000000000..9c1406f08
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/__init__.py
@@ -0,0 +1,9 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from .model import get_model, get_layers
+
+
+model_registry['cvt_cvt-21-384-in1k_4_LucyV4'] = \
+    lambda: ModelCommitment(identifier='cvt_cvt-21-384-in1k_4_LucyV4',
+                            activations_model=get_model('cvt_cvt-21-384-in1k_4_LucyV4'),
+                            layers=get_layers('cvt_cvt-21-384-in1k_4_LucyV4'))
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/model.py
new file mode 100644
index 000000000..d4b90d6cd
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/model.py
@@ -0,0 +1,142 @@
+from brainscore_vision.model_helpers.check_submission import check_models
+import functools
+from transformers import AutoFeatureExtractor, CvtForImageClassification
+from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper
+from PIL import Image
+import numpy as np
+import torch
+
+"""
+Template module for a base model submission to brain-score
+"""
+
+
+def get_model(name):
+    assert name == 'cvt_cvt-21-384-in1k_4_LucyV4'
+    # https://huggingface.co/models?sort=downloads&search=cvt
+    image_size = 384
+    processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-21-384-1k')
+    model = CvtForImageClassification.from_pretrained('microsoft/cvt-21-384-1k')
+    preprocessing = functools.partial(load_preprocess_images, processor=processor, image_size=image_size)
+    wrapper = PytorchWrapper(identifier=name, model=model, preprocessing=preprocessing)
+    wrapper.image_size = image_size
+
+    return wrapper
+
+
+def get_layers(name):
+    assert name == 'cvt_cvt-21-384-in1k_4_LucyV4'
+    layers = []
+    layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)]
+    layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(4)]
+    layers += [f'cvt.encoder.stages.2.layers.{i}' for i in range(16)]
+    layers += ['layernorm']
+    return layers
+
+
+def get_bibtex(model_identifier):
+    """
+    A method returning the bibtex reference of the requested model as a string.
+ """ + return """@misc{wu2021cvtintroducingconvolutionsvision, + title={CvT: Introducing Convolutions to Vision Transformers}, + author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang}, + year={2021}, + eprint={2103.15808}, + archivePrefix={arXiv}, + primaryClass={cs.CV}, + url={https://arxiv.org/abs/2103.15808}, + }""" + + +def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs): + images = load_images(image_filepaths) + # images = [, ...] + images = [image.resize((image_size, image_size)) for image in images] + if processor is not None: + images = [processor(images=image, return_tensors="pt", **kwargs) for image in images] + if len(images[0].keys()) != 1: + raise NotImplementedError(f'unknown processor for getting model {processor}') + assert list(images[0].keys())[0] == 'pixel_values' + images = [image['pixel_values'] for image in images] + images = torch.cat(images) + images = images.cpu().numpy() + else: + images = preprocess_images(images, image_size=image_size, **kwargs) + return images + + +def load_images(image_filepaths): + return [load_image(image_filepath) for image_filepath in image_filepaths] + + +def load_image(image_filepath): + with Image.open(image_filepath) as pil_image: + if 'L' not in pil_image.mode.upper() and 'A' not in pil_image.mode.upper() \ + and 'P' not in pil_image.mode.upper(): # not binary and not alpha and not palletized + # work around to https://github.com/python-pillow/Pillow/issues/1144, + # see https://stackoverflow.com/a/30376272/2225200 + return pil_image.copy() + else: # make sure potential binary images are in RGB + rgb_image = Image.new("RGB", pil_image.size) + rgb_image.paste(pil_image) + return rgb_image + + +def preprocess_images(images, image_size, **kwargs): + preprocess = torchvision_preprocess_input(image_size, **kwargs) + images = [preprocess(image) for image in images] + images = np.concatenate(images) + return images + + +def torchvision_preprocess_input(image_size, **kwargs): + from torchvision import transforms + return transforms.Compose([ + transforms.Resize((image_size, image_size)), + torchvision_preprocess(**kwargs), + ]) + + +def torchvision_preprocess(normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)): + from torchvision import transforms + return transforms.Compose([ + transforms.ToTensor(), + transforms.Normalize(mean=normalize_mean, std=normalize_std), + lambda img: img.unsqueeze(0) + ]) + + +def create_static_video(image, num_frames, normalize_0to1=False, channel_dim=3): + ''' + Create a static video with the same image in all frames. + Args: + image (PIL.Image.Image): Input image. + num_frames (int): Number of frames in the video. + Returns: + result (np.ndarray): np array of frames of shape (num_frames, height, width, 3). + ''' + frames = [] + for _ in range(num_frames): + frame = np.array(image) + if normalize_0to1: + frame = frame / 255. + if channel_dim == 1: + frame = frame.transpose(2, 0, 1) + frames.append(frame) + return np.stack(frames) + + +if __name__ == '__main__': + # Use this method to ensure the correctness of the BaseModel implementations. + # It executes a mock run of brain-score benchmarks. 
+    check_models.check_base_models(__name__)
+
+"""
+Notes on the error:
+
+- 'channel_x' key error:
+# 'embeddings.patch_embeddings.projection',
+https://github.com/search?q=repo%3Abrain-score%2Fmodel-tools%20channel_x&type=code
+
+"""
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in1k_4/requirements.txt b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/requirements.txt
new file mode 100644
index 000000000..b71f00d65
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+transformers==4.30.2
+pillow
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/test.py
new file mode 100644
index 000000000..92f89cc52
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in1k_4/test.py
@@ -0,0 +1,8 @@
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.travis_slow
+def test_has_identifier():
+    model = brainscore_vision.load_model('cvt_cvt-21-384-in1k_4_LucyV4')
+    assert model.identifier == 'cvt_cvt-21-384-in1k_4_LucyV4'
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/__init__.py b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/__init__.py
new file mode 100644
index 000000000..7905a73b2
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/__init__.py
@@ -0,0 +1,9 @@
+from brainscore_vision import model_registry
+from brainscore_vision.model_helpers.brain_transformation import ModelCommitment
+from .model import get_model, get_layers
+
+
+model_registry['cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'] = \
+    lambda: ModelCommitment(identifier='cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4',
+                            activations_model=get_model('cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'),
+                            layers=get_layers('cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'))
\ No newline at end of file
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/model.py b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/model.py
new file mode 100644
index 000000000..ad0ee0e6d
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/model.py
@@ -0,0 +1,142 @@
+from brainscore_vision.model_helpers.check_submission import check_models
+import functools
+from transformers import AutoFeatureExtractor, CvtForImageClassification
+from brainscore_vision.model_helpers.activations.pytorch import PytorchWrapper
+from PIL import Image
+import numpy as np
+import torch
+
+"""
+Template module for a base model submission to brain-score
+"""
+
+
+def get_model(name):
+    assert name == 'cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'
+    # https://huggingface.co/models?sort=downloads&search=cvt
+    image_size = 384
+    processor = AutoFeatureExtractor.from_pretrained('microsoft/cvt-21-384-22k')
+    model = CvtForImageClassification.from_pretrained('microsoft/cvt-21-384-22k')
+    preprocessing = functools.partial(load_preprocess_images, processor=processor, image_size=image_size)
+    wrapper = PytorchWrapper(identifier=name, model=model, preprocessing=preprocessing)
+    wrapper.image_size = image_size
+
+    return wrapper
+
+
+def get_layers(name):
+    assert name == 'cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'
+    layers = []
+    layers += [f'cvt.encoder.stages.0.layers.{i}' for i in range(1)]
+    layers += [f'cvt.encoder.stages.1.layers.{i}' for i in range(4)]
+    layers += [f'cvt.encoder.stages.2.layers.{i}' for i in range(16)]
+    layers += ['layernorm']
+    return layers
+
+
+def get_bibtex(model_identifier):
+    """
+    A method returning the bibtex reference of the requested model as a string.
+    """
+    return """@misc{wu2021cvtintroducingconvolutionsvision,
+        title={CvT: Introducing Convolutions to Vision Transformers},
+        author={Haiping Wu and Bin Xiao and Noel Codella and Mengchen Liu and Xiyang Dai and Lu Yuan and Lei Zhang},
+        year={2021},
+        eprint={2103.15808},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2103.15808},
+  }"""
+
+
+def load_preprocess_images(image_filepaths, image_size, processor=None, **kwargs):
+    images = load_images(image_filepaths)
+    # images = [, ...]
+    images = [image.resize((image_size, image_size)) for image in images]
+    if processor is not None:
+        images = [processor(images=image, return_tensors="pt", **kwargs) for image in images]
+        if len(images[0].keys()) != 1:
+            raise NotImplementedError(f'unknown processor for getting model {processor}')
+        assert list(images[0].keys())[0] == 'pixel_values'
+        images = [image['pixel_values'] for image in images]
+        images = torch.cat(images)
+        images = images.cpu().numpy()
+    else:
+        images = preprocess_images(images, image_size=image_size, **kwargs)
+    return images
+
+
+def load_images(image_filepaths):
+    return [load_image(image_filepath) for image_filepath in image_filepaths]
+
+
+def load_image(image_filepath):
+    with Image.open(image_filepath) as pil_image:
+        if 'L' not in pil_image.mode.upper() and 'A' not in pil_image.mode.upper() \
+                and 'P' not in pil_image.mode.upper():  # not binary and not alpha and not palettized
+            # workaround for https://github.com/python-pillow/Pillow/issues/1144,
+            # see https://stackoverflow.com/a/30376272/2225200
+            return pil_image.copy()
+        else:  # make sure potential binary images are in RGB
+            rgb_image = Image.new("RGB", pil_image.size)
+            rgb_image.paste(pil_image)
+            return rgb_image
+
+
+def preprocess_images(images, image_size, **kwargs):
+    preprocess = torchvision_preprocess_input(image_size, **kwargs)
+    images = [preprocess(image) for image in images]
+    images = np.concatenate(images)
+    return images
+
+
+def torchvision_preprocess_input(image_size, **kwargs):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.Resize((image_size, image_size)),
+        torchvision_preprocess(**kwargs),
+    ])
+
+
+def torchvision_preprocess(normalize_mean=(0.485, 0.456, 0.406), normalize_std=(0.229, 0.224, 0.225)):
+    from torchvision import transforms
+    return transforms.Compose([
+        transforms.ToTensor(),
+        transforms.Normalize(mean=normalize_mean, std=normalize_std),
+        lambda img: img.unsqueeze(0)
+    ])
+
+
+def create_static_video(image, num_frames, normalize_0to1=False, channel_dim=3):
+    '''
+    Create a static video with the same image in all frames.
+    Args:
+        image (PIL.Image.Image): Input image.
+        num_frames (int): Number of frames in the video.
+    Returns:
+        result (np.ndarray): np array of frames of shape (num_frames, height, width, 3).
+    '''
+    frames = []
+    for _ in range(num_frames):
+        frame = np.array(image)
+        if normalize_0to1:
+            frame = frame / 255.
+        if channel_dim == 1:
+            frame = frame.transpose(2, 0, 1)
+        frames.append(frame)
+    return np.stack(frames)
+
+
+if __name__ == '__main__':
+    # Use this method to ensure the correctness of the BaseModel implementations.
+    # It executes a mock run of brain-score benchmarks.
+    check_models.check_base_models(__name__)
+
+"""
+Notes on the error:
+
+- 'channel_x' key error:
+# 'embeddings.patch_embeddings.projection',
+https://github.com/search?q=repo%3Abrain-score%2Fmodel-tools%20channel_x&type=code
+
+"""
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/requirements.txt b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/requirements.txt
new file mode 100644
index 000000000..b71f00d65
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/requirements.txt
@@ -0,0 +1,4 @@
+numpy
+torch
+transformers==4.30.2
+pillow
diff --git a/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/test.py b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/test.py
new file mode 100644
index 000000000..304916c3d
--- /dev/null
+++ b/brainscore_vision/models/cvt_cvt_21_384_in22k_finetuned_in1k_4/test.py
@@ -0,0 +1,8 @@
+import pytest
+import brainscore_vision
+
+
+@pytest.mark.travis_slow
+def test_has_identifier():
+    model = brainscore_vision.load_model('cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4')
+    assert model.identifier == 'cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4'
\ No newline at end of file
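
The six registry entries above are wired through lazy lambdas, so importing a plugin downloads nothing; a HuggingFace checkpoint is only fetched when its ModelCommitment is actually built. The per-model test.py files each check a single identifier; the following combined smoke test is a sketch rather than part of the diff, and assumes all six plugins are installed:

    import brainscore_vision

    # identifiers registered by the __init__.py files in this diff
    identifiers = [
        'cvt_cvt-13-224-in1k_4_LucyV4',
        'cvt_cvt-13-384-in1k_4_LucyV4',
        'cvt_cvt-13-384-in22k_finetuned-in1k_4_LucyV4',
        'cvt_cvt-21-224-in1k_4_LucyV4',
        'cvt_cvt-21-384-in1k_4_LucyV4',
        'cvt_cvt-21-384-in22k_finetuned-in1k_4_LucyV4',
    ]

    for identifier in identifiers:
        model = brainscore_vision.load_model(identifier)  # invokes the lazy lambda
        assert model.identifier == identifier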
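
Each model.py also duplicates the same preprocessing helpers. The processor=None fallback of load_preprocess_images routes through the torchvision Resize/ToTensor/Normalize pipeline and should yield a normalized NCHW float array at the model's input resolution; a quick sanity check of that branch is sketched below. The import path mirrors this diff's file layout but is otherwise an assumption:

    import tempfile
    from PIL import Image

    from brainscore_vision.models.cvt_cvt_13_384_in1k_4.model import load_preprocess_images

    with tempfile.NamedTemporaryFile(suffix='.png') as f:
        Image.new('RGB', (500, 400)).save(f.name)
        # no HuggingFace processor given, so the torchvision branch runs
        pixels = load_preprocess_images([f.name], image_size=384)

    assert pixels.shape == (1, 3, 384, 384)  # batch, channels, height, width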