This repository has been archived by the owner on Jul 15, 2021. It is now read-only.

add ImageNet train, add .val suffix to previous #61

Draft · wants to merge 1 commit into master
4 changes: 2 additions & 2 deletions brainio_collection/lookup.csv
@@ -220,8 +220,8 @@ dietterich.Hendrycks2019.jpeg_compression_5,stimulus_set,,S3,https://brainio-con
 dicarlo.Rajalingham2020,stimulus_set,StimulusSet,S3,https://brainio.dicarlo.s3.amazonaws.com/image_dicarlo_Rajalingham2020.csv,9a9a6b3115d2d8ce5d54ec2522093d8a87ed13a0,
 dicarlo.Rajalingham2020,stimulus_set,,S3,https://brainio.dicarlo.s3.amazonaws.com/image_dicarlo_Rajalingham2020.zip,6097086901032e20f8ae764e9cc06e0a891a3e18,
 dicarlo.Rajalingham2020,assembly,NeuronRecordingAssembly,S3,https://brainio.dicarlo.s3.amazonaws.com/assy_dicarlo_Rajalingham2020.nc,ab95ae6c9907438f87b9b13b238244049f588680,dicarlo.Rajalingham2020
-fei-fei.Deng2009,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.csv,ff79dcf6b0d115e6e8aa8d0fbba3af11dc649e57,
-fei-fei.Deng2009,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.zip,78172d752d8216a00833cfa34be67c8532ad7330,
+fei-fei.Deng2009.val,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.csv,ff79dcf6b0d115e6e8aa8d0fbba3af11dc649e57,
+fei-fei.Deng2009.val,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009.zip,78172d752d8216a00833cfa34be67c8532ad7330,
 dicarlo.Seibert2019,assembly,NeuronRecordingAssembly,S3,https://brainio.dicarlo.s3.amazonaws.com/assy_dicarlo_Seibert2019.nc,eef41bb1f3d83c0e60ebf0e91511ce71ef5fee32,dicarlo.hvm
 aru.Kuzovkin2018,stimulus_set,StimulusSet,S3,https://brainio.contrib.s3.amazonaws.com/image_aru_Kuzovkin2018.csv,a5990b24aea3e453756141cbe69a83304db72d0b,
 aru.Kuzovkin2018,stimulus_set,,S3,https://brainio.contrib.s3.amazonaws.com/image_aru_Kuzovkin2018.zip,cca4d819d7743bdd4bf65c1cb2439fd0ec97543a,
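
Each lookup.csv row maps an identifier to a remote file and its SHA1, with one row for the metadata csv and one for the image zip of a stimulus set. Renaming the two fei-fei.Deng2009 rows means the ImageNet validation set now loads under the .val suffix — a minimal sketch using the repository's public accessor, with the expected size taken from the tests below:

import brainio_collection

# the identifier previously known as 'fei-fei.Deng2009' now carries a .val suffix
stimulus_set = brainio_collection.get_stimulus_set('fei-fei.Deng2009.val')
assert len(stimulus_set) == 50_000  # ImageNet validation images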
65 changes: 65 additions & 0 deletions brainio_contrib/fei-fei/deng2009imagenet_train.py
@@ -0,0 +1,65 @@
import logging
import numpy as np
import os
import sys
from pathlib import Path
from tqdm import tqdm

from brainio_base.stimuli import StimulusSet
from brainio_collection.lookup import sha1_hash
from brainio_collection.packaging import create_image_csv, upload_to_s3


def collect_stimuli(stimuli_dir):
    files = stimuli_dir.glob('*/*')
    stimulus_set = []
    for file in tqdm(files, desc='files', total=1_281_167):
        synset = file.parent.name
        stimulus_set.append({
            'synset': synset,
            'image_id': file.name,
            'filename': file.name,
            'filepath': file,
            'relative_path': file.parent.name + '/' + file.name,
            'sha1': sha1_hash(file),
        })
    stimulus_set = StimulusSet(stimulus_set)
    stimulus_set.image_paths = {row.image_id: row.filepath for row in stimulus_set.itertuples()}
    stimulus_set['split'] = 'train'
    stimulus_set['image_path_within_store'] = stimulus_set['filename'].apply(
        lambda filename: os.path.splitext(filename)[0])
    assert len(stimulus_set) == 1_281_167
    assert len(np.unique(stimulus_set['image_id'])) == len(stimulus_set), "duplicate image_ids"
    assert len(np.unique(stimulus_set['sha1'])) == 1_275_232  # lots of duplicates apparently
    assert len(np.unique(stimulus_set['synset'])) == 1_000
    del stimulus_set['filepath']
    return stimulus_set


def main():
    stimuli_dir = Path('/braintree/data2/active/common/imagenet_raw/train')
    assert stimuli_dir.is_dir()

    stimuli = collect_stimuli(stimuli_dir)
    identifier = 'fei-fei.Deng2009.train'

    # Only package the csv, not the image zip. The full training set is 140G, so we do not want to store it on
    # or retrieve it from S3. Instead, we store the csv with metadata on S3 and then use locally stored image files.
    # We use excerpts from
    # https://github.com/brain-score/brainio_collection/blob/992ae550d38681843cabfc37509d540acc44c8f6/brainio_collection/packaging.py#L141-L161
    # to package the csv (and not the image zip).
    print('Packaging csv')
    bucket_name = 'brainio.contrib'
    image_store_identifier = "image_" + identifier.replace(".", "_")
    csv_file_name = image_store_identifier + ".csv"
    target_csv_path = Path(__file__).parent / csv_file_name
    csv_sha1 = create_image_csv(stimuli, str(target_csv_path))
    print(f"CSV sha1: {csv_sha1}")
    upload_to_s3(str(target_csv_path), bucket_name, target_s3_key=csv_file_name)


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
    for logger_ignore in ['urllib3', 'botocore', 'boto3', 's3transfer']:
        logging.getLogger(logger_ignore).setLevel(logging.INFO)
    main()
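
Since only the metadata lands on S3, a downstream consumer has to rejoin it with a locally stored copy of ImageNet train. A minimal sketch of that step (not part of this PR); fetch_file, the URL, and the SHA1 are taken from the test below, while the local root path is an assumption — point it at your own copy:

from pathlib import Path

import pandas as pd

from brainio_collection.fetch import fetch_file

# fetch only the metadata csv -- never the 140G of images
csv_path = fetch_file(location_type='S3',
                      location='https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009_train.csv',
                      sha1='be793cd12a4e3ccffe17a2499e99d2125c1db4c5')
metadata = pd.read_csv(csv_path)

# re-attach local image paths via the relative_path column written by collect_stimuli above
local_root = Path('/braintree/data2/active/common/imagenet_raw/train')  # assumption: your local copy
image_paths = {row.image_id: local_root / row.relative_path for row in metadata.itertuples()}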
25 changes: 18 additions & 7 deletions tests/test_stimuli.py
@@ -1,10 +1,11 @@
-import os
-
 import imageio
 import numpy as np
+import os
+import pandas as pd
 import pytest

 import brainio_collection
+from brainio_collection.fetch import fetch_file


 def test_get_stimulus_set():
@@ -56,7 +57,7 @@ def test_dicarlohvm(self):
     'dicarlo.THINGS1',
     'dicarlo.THINGS2',
     'aru.Kuzovkin2018',
-    'fei-fei.Deng2009',
+    'fei-fei.Deng2009.val',
     'aru.Cichy2019',
     'dicarlo.BashivanKar2019.naturalistic',
     'dicarlo.BashivanKar2019.synthetic'
@@ -79,7 +80,17 @@ def test_klab_Zhang2018search():


 @pytest.mark.private_access
-def test_feifei_Deng2009():
-    stimulus_set = brainio_collection.get_stimulus_set('fei-fei.Deng2009')
-    assert len(stimulus_set) == 50_000
-    assert len(set(stimulus_set['label'])) == 1_000
+class TestFeiFeiDeng2009:
+    def test_val(self):
+        stimulus_set = brainio_collection.get_stimulus_set('fei-fei.Deng2009.val')
+        assert len(stimulus_set) == 50_000
+        assert len(set(stimulus_set['label'])) == 1_000
+
+    def test_train(self):
+        # To preserve bandwidth and space (140G), we only store the metadata for ImageNet train and keep files local.
+        csv_path = fetch_file(location_type='S3',
+                              location='https://brainio.contrib.s3.amazonaws.com/image_fei-fei_Deng2009_train.csv',
+                              sha1='be793cd12a4e3ccffe17a2499e99d2125c1db4c5')  # from packaging
+        stimulus_set = pd.read_csv(csv_path)
+        assert len(stimulus_set) == 1_281_167
+        assert len(set(stimulus_set['synset'])) == 1_000