Bugfix hdf5 filenames for substimuli #52

Merged · 2 commits · Mar 2, 2024
16 changes: 13 additions & 3 deletions pysaliency/datasets.py
@@ -1156,12 +1156,22 @@ def __init__(self, stimuli, attributes=None):
     def __len__(self):
         return len(self.stimuli)
 
+    def _get_attribute_for_stimulus_subset(self, index):
+        sub_attributes = {}
+        for attribute_name, attribute_value in self.attributes.items():
+            if isinstance(index, (list, np.ndarray)) and not isinstance(attribute_value, np.ndarray):
+                sub_attributes[attribute_name] = [attribute_value[i] for i in index]
+            else:
+                sub_attributes[attribute_name] = attribute_value[index]
+
+        return sub_attributes
+
     def __getitem__(self, index):
         if isinstance(index, slice):
-            attributes = {key: value[index] for key, value in self.attributes.items()}
+            attributes = self._get_attribute_for_stimulus_subset(index)
             return ObjectStimuli([self.stimulus_objects[i] for i in range(len(self))[index]], attributes=attributes)
         elif isinstance(index, list):
-            attributes = {key: value[index] for key, value in self.attributes.items()}
+            attributes = self._get_attribute_for_stimulus_subset(index)
             return ObjectStimuli([self.stimulus_objects[i] for i in index], attributes=attributes)
         else:
             return self.stimulus_objects[index]
@@ -1337,7 +1347,7 @@ def __getitem__(self, index):
         if isinstance(index, (list, np.ndarray)):
             filenames = [self.filenames[i] for i in index]
             shapes = [self.shapes[i] for i in index]
-            attributes = {key: [value[i] for i in index] for key, value in self.attributes.items()}
+            attributes = self._get_attribute_for_stimulus_subset(index)
             return type(self)(filenames=filenames, shapes=shapes, attributes=attributes, cached=self.cached)
         else:
             return self.stimulus_objects[index]
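
Aside (not part of the PR diff): the reason all attribute subsetting is routed through _get_attribute_for_stimulus_subset is that plain Python lists accept slices but not lists of indices, while numpy arrays accept both. A minimal standalone sketch of the same dispatch, using made-up attribute data rather than the pysaliency classes:

import numpy as np

def subset_attribute(attribute_value, index):
    # plain lists accept slices but not lists of indices; numpy arrays accept both
    if isinstance(index, (list, np.ndarray)) and not isinstance(attribute_value, np.ndarray):
        return [attribute_value[i] for i in index]
    return attribute_value[index]

attributes = {
    'dva': [1.0, 2.0, 3.0, 4.0],                 # plain Python list
    'weights': np.array([0.1, 0.2, 0.3, 0.4]),   # numpy array
}

# slicing works for both containers
print({name: subset_attribute(value, slice(0, 2)) for name, value in attributes.items()})
# list indexing now also works for the plain list, which would otherwise
# raise "TypeError: list indices must be integers or slices"
print({name: subset_attribute(value, [0, 2]) for name, value in attributes.items()})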
77 changes: 52 additions & 25 deletions pysaliency/precomputed_models.py
@@ -1,4 +1,4 @@
-from __future__ import print_function, division, absolute_import
+from __future__ import absolute_import, division, print_function
 
 import glob
 import os.path
@@ -8,14 +8,14 @@
 
 import numpy as np
 from imageio import imread
-from scipy.special import logsumexp
 from scipy.io import loadmat
+from scipy.special import logsumexp
 from tqdm import tqdm
 
+from .datasets import FileStimuli, get_image_hash
 from .models import Model
 from .saliency_map_models import SaliencyMapModel
-from .datasets import get_image_hash, FileStimuli
-from .utils import get_minimal_unique_filenames
+from .utils import full_split, get_minimal_unique_filenames
 
 
 def get_stimuli_filenames(stimuli):
@@ -28,6 +28,44 @@ def get_stimuli_filenames(stimuli):
     return stimuli.filenames
 
 
+def get_keys_from_filenames(filenames, keys):
+    """checks how much the filenames have to be shortened to get the correct hdf5 or other keys"""
+    first_filename_parts = full_split(filenames[0])
+    for part_index in range(len(first_filename_parts)):
+        remaining_filename = os.path.join(*first_filename_parts[part_index:])
+        if remaining_filename in keys:
+            break
+    else:
+        raise ValueError('No common prefix found from {}'.format(filenames[0]))
+
+    filename_keys = []
+    for filename in filenames:
+        filename_parts = full_split(filename)
+        remaining_filename = os.path.join(*filename_parts[part_index:])
+        filename_keys.append(remaining_filename)
+
+    return filename_keys
+
+
+def get_keys_from_filenames_with_prefix(filenames, keys):
+    """checks how much the filenames have to be shortened to get the correct hdf5 or other keys, where the keys may carry a prefix"""
+    first_key_parts = full_split(keys[0])
+
+    for key_part_index in range(len(first_key_parts)):
+        remaining_keys = [os.path.join(*full_split(key)[key_part_index:]) for key in keys]
+        try:
+            filename_keys = get_keys_from_filenames(filenames, remaining_keys)
+        except ValueError:
+            continue
+        else:
+            full_filename_keys = []
+            for key, filename_key in zip(keys, filename_keys):
+                full_filename_keys.append(os.path.join(*full_split(key)[:key_part_index], filename_key))
+            return full_filename_keys
+
+    raise ValueError('No common prefix found from {} and {}'.format(filenames[0], keys[0]))
+
+
 def export_model_to_hdf5(model, stimuli, filename, compression=9, overwrite=True, flush=False):
     """Export pysaliency model predictions for stimuli into hdf5 file
 
@@ -83,8 +121,8 @@ def _file_for_stimulus(self, stimulus):
 
         try:
             stimulus_index = self.stimuli.stimulus_ids.index(stimulus_id)
-        except IndexError:
-            raise IndexError("Stimulus id '{}' not found in stimuli!".format(stimulus_id))
+        except IndexError as exc:
+            raise IndexError("Stimulus id '{}' not found in stimuli!".format(stimulus_id)) from exc
 
         return self.files[stimulus_index]
@@ -114,8 +152,8 @@ def __init__(self, stimuli, directory, **kwargs):
         files = [os.path.relpath(filename, start=directory) for filename in glob.glob(os.path.join(directory, '**', '*'), recursive=True)]
         stems = [os.path.splitext(f)[0] for f in files]
 
-        stimuli_files = get_minimal_unique_filenames(stimulus_filenames)
-        stimuli_stems = [os.path.splitext(f)[0] for f in stimuli_files]
+        stimuli_stems = [os.path.splitext(f)[0] for f in stimulus_filenames]
+        stimuli_stems = get_keys_from_filenames(stimuli_stems, stems)
 
         if not set(stimuli_stems).issubset(stems):
             missing_predictions = set(stimuli_stems).difference(stems)
@@ -197,14 +235,6 @@ def get_keys_recursive(group, prefix=''):
 
     return keys
 
-def get_stimulus_key(stimulus_name, all_keys):
-    matching_keys = [key for key in all_keys if key.endswith(stimulus_name)]
-    if len(matching_keys) == 0:
-        raise ValueError(f"Stimulus {stimulus_name} not found in hdf5 file!")
-    elif len(matching_keys) > 1:
-        raise ValueError(f"Stimulus {stimulus_name} not unique in hdf5 file!")
-    return matching_keys[0]
-
 
 class HDF5SaliencyMapModel(SaliencyMapModel):
     """ exposes a HDF5 file with saliency maps as pysaliency model
@@ -220,23 +250,20 @@ def __init__(self, stimuli, filename, check_shape=True, **kwargs):
         self.filename = filename
         self.check_shape = check_shape
 
-        self.names = get_minimal_unique_filenames(
-            get_stimuli_filenames(stimuli)
-        )
-
         import h5py
         self.hdf5_file = h5py.File(self.filename, 'r')
         self.all_keys = get_keys_recursive(self.hdf5_file)
 
+        self.names = get_keys_from_filenames(get_stimuli_filenames(stimuli), self.all_keys)
+
     def _saliency_map(self, stimulus):
         stimulus_id = get_image_hash(stimulus)
         stimulus_index = self.stimuli.stimulus_ids.index(stimulus_id)
-        stimulus_filename = self.names[stimulus_index]
-        stimulus_key = get_stimulus_key(stimulus_filename, self.all_keys)
+        stimulus_key = self.names[stimulus_index]
         smap = self.hdf5_file[stimulus_key][:]
         if not smap.shape == (stimulus.shape[0], stimulus.shape[1]):
             if self.check_shape:
-                warnings.warn('Wrong shape for stimulus {}'.format(stimulus_key))
+                warnings.warn('Wrong shape for stimulus {}'.format(stimulus_key), stacklevel=4)
         return smap


@@ -302,8 +329,8 @@ def __init__(self, stimuli, archive_file, *args, **kwargs):
         files = [f for f in files if '__macosx' not in f.lower()]
         stems = [os.path.splitext(f)[0] for f in files]
 
-        stimuli_files = get_minimal_unique_filenames(get_stimuli_filenames(stimuli))
-        stimuli_stems = [os.path.splitext(f)[0] for f in stimuli_files]
+        stimuli_stems = [os.path.splitext(f)[0] for f in get_stimuli_filenames(stimuli)]
+        stimuli_stems = get_keys_from_filenames_with_prefix(stimuli_stems, stems)
 
         prediction_filenames = []
         for stimuli_stem in stimuli_stems:
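
Aside (not part of the PR diff): get_keys_from_filenames strips leading path components from the first filename until the remainder matches one of the keys, then applies the same cut to every filename; get_keys_from_filenames_with_prefix additionally tries stripping leading components from the keys and re-attaches them afterwards. A minimal standalone sketch of the core matching step, with a simplified stand-in for pysaliency.utils.full_split and made-up paths:

import os.path

def full_split(path):
    # split a path into all of its components
    # (simplified stand-in for pysaliency.utils.full_split)
    parts = []
    while True:
        head, tail = os.path.split(path)
        if not tail:
            break
        parts.insert(0, tail)
        path = head
    return parts

def keys_from_filenames(filenames, keys):
    # re-implementation of the PR's get_keys_from_filenames for illustration:
    # drop leading components of the first filename until the remainder is a key,
    # then apply the same cut to every filename
    first_parts = full_split(filenames[0])
    for part_index in range(len(first_parts)):
        if os.path.join(*first_parts[part_index:]) in keys:
            break
    else:
        raise ValueError('no suffix of {} found in keys'.format(filenames[0]))
    return [os.path.join(*full_split(f)[part_index:]) for f in filenames]

filenames = ['/data/stimuli/sub_directory_0001/image_0001',
             '/data/stimuli/sub_directory_0002/image_0007']
keys = ['sub_directory_0001/image_0001', 'sub_directory_0002/image_0007']
print(keys_from_filenames(filenames, keys))
# ['sub_directory_0001/image_0001', 'sub_directory_0002/image_0007']

As far as the diff shows, this is what lets HDF5SaliencyMapModel and the directory/archive models resolve a sub-stimuli subset: the stored keys only need to match a common suffix of the stimulus filenames, rather than the minimal unique filenames computed over the full stimulus set, which change when the set is subsetted.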
10 changes: 10 additions & 0 deletions tests/test_datasets.py
@@ -561,6 +561,11 @@ def test_stimuli_attributes(stimuli_with_attributes, tmp_path):
     assert stimuli_with_attributes.attributes['dva'][:5] == partial_stimuli.attributes['dva']
     assert stimuli_with_attributes.attributes['some_strings'][:5] == partial_stimuli.attributes['some_strings']
 
+    partial_stimuli = stimuli_with_attributes[[1, 2, 6]]
+    assert stimuli_with_attributes.attributes.keys() == partial_stimuli.attributes.keys()
+    assert list(np.array(stimuli_with_attributes.attributes['dva'])[[1, 2, 6]]) == partial_stimuli.attributes['dva']
+    assert list(np.array(stimuli_with_attributes.attributes['some_strings'])[[1, 2, 6]]) == partial_stimuli.attributes['some_strings']
+
 
 @pytest.fixture
 def file_stimuli_with_attributes(tmpdir):
@@ -601,6 +606,11 @@ def test_file_stimuli_attributes(file_stimuli_with_attributes, tmp_path):
     assert file_stimuli_with_attributes.attributes['dva'][:5] == partial_stimuli.attributes['dva']
     assert file_stimuli_with_attributes.attributes['some_strings'][:5] == partial_stimuli.attributes['some_strings']
 
+    partial_stimuli = file_stimuli_with_attributes[[1, 2, 6]]
+    assert file_stimuli_with_attributes.attributes.keys() == partial_stimuli.attributes.keys()
+    assert list(np.array(file_stimuli_with_attributes.attributes['dva'])[[1, 2, 6]]) == partial_stimuli.attributes['dva']
+    assert list(np.array(file_stimuli_with_attributes.attributes['some_strings'])[[1, 2, 6]]) == partial_stimuli.attributes['some_strings']
+
 
 def test_concatenate_stimuli_with_attributes(stimuli_with_attributes, file_stimuli_with_attributes):
     concatenated_stimuli = pysaliency.datasets.concatenate_stimuli([stimuli_with_attributes, file_stimuli_with_attributes])
95 changes: 76 additions & 19 deletions tests/test_precomputed_models.py
@@ -1,24 +1,28 @@
-from __future__ import division, print_function, absolute_import, unicode_literals
+from __future__ import absolute_import, division, print_function, unicode_literals
 
 import os
 import pathlib
 import zipfile
 
+import numpy as np
+import pytest
+
 from imageio import imsave
-import numpy as np
 
 import pysaliency
 from pysaliency import export_model_to_hdf5
 
 
+class TestSaliencyMapModel(pysaliency.SaliencyMapModel):
+    def _saliency_map(self, stimulus):
+        stimulus_data = pysaliency.datasets.as_stimulus(stimulus).stimulus_data
+        return np.array(stimulus_data, dtype=float)
+
+
 @pytest.fixture
 def file_stimuli(tmpdir):
     filenames = []
     for i in range(3):
-        # TODO: change back to stimulus_... once this is supported again
-        filename = tmpdir.join('_stimulus_{:04d}.png'.format(i))
+        filename = tmpdir.join('stimulus_{:04d}.png'.format(i))
         imsave(str(filename), np.random.randint(low=0, high=255, size=(100, 100, 3), dtype=np.uint8))
         filenames.append(str(filename))
 
@@ -37,8 +41,7 @@ def stimuli_with_filenames(tmpdir):
     filenames = []
     stimuli = []
     for i in range(3):
-        # TODO: change back to stimulus_... once this is supported again
-        filename = tmpdir.join('_stimulus_{:04d}.png'.format(i))
+        filename = tmpdir.join('stimulus_{:04d}.png'.format(i))
         stimuli.append(np.random.randint(low=0, high=255, size=(100, 100, 3), dtype=np.uint8))
         filenames.append(str(filename))
 
@@ -61,6 +64,14 @@ def stimuli(file_stimuli, stimuli_with_filenames, request):
         raise ValueError(request.param)
 
 
+@pytest.fixture
+def sub_stimuli(stimuli):
+    unique_filenames = pysaliency.utils.get_minimal_unique_filenames(
+        pysaliency.precomputed_models.get_stimuli_filenames(stimuli)
+    )
+    return stimuli[[i for i, f in enumerate(unique_filenames) if f.startswith('sub_directory_0001')]]
+
+
 @pytest.fixture
 def saliency_maps_in_directory(file_stimuli, tmpdir):
     stimuli_files = pysaliency.utils.get_minimal_unique_filenames(file_stimuli.filenames)
@@ -80,7 +91,7 @@ def saliency_maps_in_directory(file_stimuli, tmpdir):
 
 
 def test_export_model_to_hdf5(stimuli, tmpdir):
-    model = pysaliency.UniformModel()
+    model = pysaliency.models.SaliencyMapNormalizingModel(TestSaliencyMapModel())
     filename = str(tmpdir.join('model.hdf5'))
     export_model_to_hdf5(model, stimuli, filename)
 
@@ -89,6 +100,16 @@ def test_export_model_to_hdf5(stimuli, tmpdir):
     np.testing.assert_allclose(model.log_density(s), model2.log_density(s))
 
 
+def test_hdf5_model_sub_stimuli(stimuli, sub_stimuli, tmpdir):
+    model = pysaliency.models.SaliencyMapNormalizingModel(TestSaliencyMapModel())
+    filename = str(tmpdir.join('model.hdf5'))
+    export_model_to_hdf5(model, stimuli, filename)
+
+    model2 = pysaliency.HDF5Model(sub_stimuli, filename)
+    for s in sub_stimuli:
+        np.testing.assert_allclose(model.log_density(s), model2.log_density(s))
+
+
 def test_export_model_overwrite(file_stimuli, tmpdir):
     model1 = pysaliency.GaussianSaliencyMapModel(width=0.1)
     model2 = pysaliency.GaussianSaliencyMapModel(width=0.8)
@@ -124,35 +145,71 @@ def test_export_model_no_overwrite(file_stimuli, tmpdir):
     np.testing.assert_allclose(model2.saliency_map(s), model3.saliency_map(s))
 
 
-def test_saliency_map_model_from_directory(file_stimuli, saliency_maps_in_directory):
+def test_saliency_map_model_from_directory(stimuli, saliency_maps_in_directory):
     directory, predictions = saliency_maps_in_directory
-    model = pysaliency.SaliencyMapModelFromDirectory(file_stimuli, directory)
+    model = pysaliency.SaliencyMapModelFromDirectory(stimuli, directory)
 
-    for stimulus_index, stimulus in enumerate(file_stimuli):
+    for stimulus_index, stimulus in enumerate(stimuli):
         expected = predictions[stimulus_index]
         actual = model.saliency_map(stimulus)
         np.testing.assert_equal(actual, expected)
 
-@pytest.mark.skip("currently archivemodels can't handle same stimuli names in directory and subdirectory")
-def test_saliency_map_model_from_archive(file_stimuli, saliency_maps_in_directory, tmpdir):
+
+def test_saliency_map_model_from_directory_sub_stimuli(stimuli, sub_stimuli, saliency_maps_in_directory):
+    directory, predictions = saliency_maps_in_directory
+    full_model = pysaliency.SaliencyMapModelFromDirectory(stimuli, directory)
+    sub_model = pysaliency.SaliencyMapModelFromDirectory(sub_stimuli, directory)
+
+    for stimulus in sub_stimuli:
+        expected = full_model.saliency_map(stimulus)
+        actual = sub_model.saliency_map(stimulus)
+        np.testing.assert_equal(actual, expected)
+
+
+def test_saliency_map_model_from_archive(stimuli, saliency_maps_in_directory, tmpdir):
     directory, predictions = saliency_maps_in_directory
 
     archive = tmpdir / 'predictions.zip'
 
     # from https://stackoverflow.com/a/1855118
     def zipdir(path, ziph):
-        for root, dirs, files in os.walk(path):
+        for root, _, files in os.walk(path):
             for file in files:
-                ziph.write(os.path.join(root, file),
-                           os.path.relpath(os.path.join(root, file),
+                ziph.write(os.path.join(root, file),
+                           os.path.relpath(os.path.join(root, file),
                                            os.path.join(path, '..')))
 
     with zipfile.ZipFile(str(archive), 'w', zipfile.ZIP_DEFLATED) as zipf:
         zipdir(str(directory), zipf)
 
-    model = pysaliency.precomputed_models.SaliencyMapModelFromArchive(file_stimuli, str(archive))
+    model = pysaliency.precomputed_models.SaliencyMapModelFromArchive(stimuli, str(archive))
 
-    for stimulus_index, stimulus in enumerate(file_stimuli):
+    for stimulus_index, stimulus in enumerate(stimuli):
         expected = predictions[stimulus_index]
         actual = model.saliency_map(stimulus)
         np.testing.assert_equal(actual, expected)
+
+
+def test_saliency_map_model_from_archive_sub_stimuli(stimuli, sub_stimuli, saliency_maps_in_directory, tmpdir):
+    directory, predictions = saliency_maps_in_directory
+
+    archive = tmpdir / 'predictions.zip'
+
+    # from https://stackoverflow.com/a/1855118
+    def zipdir(path, ziph):
+        for root, _, files in os.walk(path):
+            for file in files:
+                ziph.write(os.path.join(root, file),
+                           os.path.relpath(os.path.join(root, file),
+                                           os.path.join(path, '..')))
+
+    with zipfile.ZipFile(str(archive), 'w', zipfile.ZIP_DEFLATED) as zipf:
+        zipdir(str(directory), zipf)
+
+    full_model = pysaliency.precomputed_models.SaliencyMapModelFromArchive(stimuli, str(archive))
+    sub_model = pysaliency.precomputed_models.SaliencyMapModelFromArchive(sub_stimuli, str(archive))
+
+    for stimulus in sub_stimuli:
+        expected = full_model.saliency_map(stimulus)
+        actual = sub_model.saliency_map(stimulus)
+        np.testing.assert_equal(actual, expected)