From e2949215a5a8154afa18f932b786a769007d8101 Mon Sep 17 00:00:00 2001 From: Martin Schrimpf Date: Sun, 21 Mar 2021 12:02:57 -0400 Subject: [PATCH] add ImageNet train, add .val suffix to previous --- brainio_collection/lookup.csv | 4 +- .../fei-fei/deng2009imagenet_train.py | 65 +++++++++++++++++++ tests/test_stimuli.py | 25 +++++-- 3 files changed, 85 insertions(+), 9 deletions(-) create mode 100644 brainio_contrib/fei-fei/deng2009imagenet_train.py diff --git a/brainio_collection/lookup.csv b/brainio_collection/lookup.csv index d26c8ad..9fe5380 100644 --- a/brainio_collection/lookup.csv +++ b/brainio_collection/lookup.csv @@ -220,8 +220,8 @@ dietterich.Hendrycks2019.jpeg_compression_5,stimulus_set,,S3,https://brainio-con dicarlo.Rajalingham2020,stimulus_set,StimulusSet,S3,https://brainio.dicarlo.s3.amazonaws.com/image_dicarlo_Rajalingham2020.csv,9a9a6b3115d2d8ce5d54ec2522093d8a87ed13a0, dicarlo.Rajalingham2020,stimulus_set,,S3,https://brainio.dicarlo.s3.amazonaws.com/image_dicarlo_Rajalingham2020.zip,6097086901032e20f8ae764e9cc06e0a891a3e18, dicarlo.Rajalingham2020,assembly,NeuronRecordingAssembly,S3,https://brainio.dicarlo.s3.amazonaws.com/assy_dicarlo_Rajalingham2020.nc,ab95ae6c9907438f87b9b13b238244049f588680,dicarlo.Rajalingham2020 -fei-fei.Deng2009,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.csv,ff79dcf6b0d115e6e8aa8d0fbba3af11dc649e57, -fei-fei.Deng2009,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.zip,78172d752d8216a00833cfa34be67c8532ad7330, +fei-fei.Deng2009.val,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.csv,ff79dcf6b0d115e6e8aa8d0fbba3af11dc649e57, +fei-fei.Deng2009.val,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.zip,78172d752d8216a00833cfa34be67c8532ad7330, 
dicarlo.Seibert2019,assembly,NeuronRecordingAssembly,S3,https://brainio.dicarlo.s3.amazonaws.com/assy_dicarlo_Seibert2019.nc,eef41bb1f3d83c0e60ebf0e91511ce71ef5fee32,dicarlo.hvm aru.Kuzovkin2018,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_aru_Kuzovkin2018.csv,a5990b24aea3e453756141cbe69a83304db72d0b, aru.Kuzovkin2018,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_aru_Kuzovkin2018.zip,cca4d819d7743bdd4bf65c1cb2439fd0ec97543a, diff --git a/brainio_contrib/fei-fei/deng2009imagenet_train.py b/brainio_contrib/fei-fei/deng2009imagenet_train.py new file mode 100644 index 0000000..e279e7f --- /dev/null +++ b/brainio_contrib/fei-fei/deng2009imagenet_train.py @@ -0,0 +1,65 @@ +import logging +import numpy as np +import os +import sys +from pathlib import Path +from tqdm import tqdm + +from brainio_base.stimuli import StimulusSet +from brainio_collection.lookup import sha1_hash +from brainio_collection.packaging import create_image_csv, upload_to_s3 + + +def collect_stimuli(stimuli_dir): + files = stimuli_dir.glob('*/*') + stimulus_set = [] + for file in tqdm(files, desc='files', total=1_281_167): + synset = file.parent.name + stimulus_set.append({ + 'synset': synset, + 'image_id': file.name, + 'filename': file.name, + 'filepath': file, + 'relative_path': file.parent.name + '/' + file.name, + 'sha1': sha1_hash(file), + }) + stimulus_set = StimulusSet(stimulus_set) + stimulus_set.image_paths = {row.image_id: row.filepath for row in stimulus_set.itertuples()} + stimulus_set['split'] = 'train' + stimulus_set['image_path_within_store'] = stimulus_set['filename'].apply( + lambda filename: os.path.splitext(filename)[0]) + assert len(stimulus_set) == 1_281_167 + assert len(np.unique(stimulus_set['image_id'])) == len(stimulus_set), "duplicate image_ids" + assert len(np.unique(stimulus_set['sha1'])) == 1_275_232 # lots of duplicates apparently + assert len(np.unique(stimulus_set['synset'])) == 1_000 + del stimulus_set['filepath'] + 
return stimulus_set + + +def main(): + stimuli_dir = Path('/braintree/data2/active/common/imagenet_raw/train') + assert stimuli_dir.is_dir() + + stimuli = collect_stimuli(stimuli_dir) + identifier = 'fei-fei.Deng2009.train' + + # Only package the csv, not the image zip. The full training set is 140G so we do not want to store it on + # or retrieve it from S3. Instead, we store the csv with metadata on S3 and then use locally stored image files. + # We use excerpts from + # https://github.com/brain-score/brainio_collection/blob/992ae550d38681843cabfc37509d540acc44c8f6/brainio_collection/packaging.py#L141-L161 + # to package the csv (and not the image zip). + print('Packaging csv') + bucket_name = 'brainio.contrib' + image_store_identifier = "image_" + identifier.replace(".", "_") + csv_file_name = image_store_identifier + ".csv" + target_csv_path = Path(__file__).parent / csv_file_name + csv_sha1 = create_image_csv(stimuli, str(target_csv_path)) + print(f"CSV sha1: {csv_sha1}") + upload_to_s3(str(target_csv_path), bucket_name, target_s3_key=csv_file_name) + + +if __name__ == '__main__': + logging.basicConfig(stream=sys.stdout, level=logging.DEBUG) + for logger_ignore in ['urllib3', 'botocore', 'boto3', 's3transfer']: + logging.getLogger(logger_ignore).setLevel(logging.INFO) + main() diff --git a/tests/test_stimuli.py b/tests/test_stimuli.py index b2e5448..fb24344 100644 --- a/tests/test_stimuli.py +++ b/tests/test_stimuli.py @@ -1,10 +1,11 @@ -import os - import imageio import numpy as np +import os +import pandas as pd import pytest import brainio_collection +from brainio_collection.fetch import fetch_file def test_get_stimulus_set(): @@ -56,7 +57,7 @@ def test_dicarlohvm(self): 'dicarlo.THINGS1', 'dicarlo.THINGS2', 'aru.Kuzovkin2018', - 'fei-fei.Deng2009', + 'fei-fei.Deng2009.val', 'aru.Cichy2019', 'dicarlo.BashivanKar2019.naturalistic', 'dicarlo.BashivanKar2019.synthetic' @@ -79,7 +80,17 @@ def test_klab_Zhang2018search(): @pytest.mark.private_access -def 
test_feifei_Deng2009(): - stimulus_set = brainio_collection.get_stimulus_set('fei-fei.Deng2009') - assert len(stimulus_set) == 50_000 - assert len(set(stimulus_set['label'])) == 1_000 +class TestFeiFeiDeng2009: + def test_val(self): + stimulus_set = brainio_collection.get_stimulus_set('fei-fei.Deng2009.val') + assert len(stimulus_set) == 50_000 + assert len(set(stimulus_set['label'])) == 1_000 + + def test_train(self): + # To preserve bandwidth and space (140G), we only store the metadata for ImageNet train and keep files local. + csv_path = fetch_file(location_type='S3', + location='https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009_train.csv', + sha1='be793cd12a4e3ccffe17a2499e99d2125c1db4c5') # from packaging + stimulus_set = pd.read_csv(csv_path) + assert len(stimulus_set) == 1_281_167 + assert len(set(stimulus_set['synset'])) == 1_000