Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ignore incomplete entries in JSON 'run files map' caches #547

Merged
merged 4 commits into from
Sep 9, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions extra_data/file_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,8 +146,8 @@ def __init__(self, filename, _cache_info=None):
self.train_ids = _cache_info['train_ids']
self.control_sources = _cache_info['control_sources']
self.instrument_sources = _cache_info['instrument_sources']
self.legacy_sources = _cache_info.get('legacy_sources', {})
self.validity_flag = _cache_info.get('flag', None)
self.legacy_sources = _cache_info['legacy_sources']
self.validity_flag = _cache_info['flag']
else:
try:
tid_data = self.file['INDEX/trainId'][:]
Expand Down
41 changes: 22 additions & 19 deletions extra_data/run_files_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ class RunFilesMap:
"""
cache_file = None

expected_cache_keys = frozenset({
'train_ids',
'control_sources',
'instrument_sources',
'suspect_train_indices',
'legacy_sources',
})

def __init__(self, directory):
self.directory = osp.abspath(directory)
self.dir_stat = os.stat(self.directory)
Expand Down Expand Up @@ -89,7 +97,7 @@ def map_paths_for_run(self, directory):
def load(self):
"""Load the cached data

This skips over invalid cache entries(based on the file's size & mtime).
This drops invalid or incomplete cache entries.
"""
loaded_data = []
t0 = time.monotonic()
Expand Down Expand Up @@ -121,13 +129,21 @@ def load(self):
st = os.stat(osp.join(self.directory, filename))
except OSError:
continue
if (st.st_mtime == info['mtime']) and (st.st_size == info['size']):
if self._cache_info_valid(info, st):
self.files_data[filename] = info

if loaded_data:
dt = time.monotonic() - t0
log.debug("Loaded cached files map in %.2g s", dt)

@classmethod
def _cache_info_valid(cls, info, file_stat: os.stat_result):
# Ignore the cached info if the file size or mtime have changed, or
# if it is missing expected keys (likely keys added more recently).
return ((file_stat.st_mtime == info['mtime'])
and (file_stat.st_size == info['size'])
and cls.expected_cache_keys.issubset(info.keys()))

def is_my_directory(self, dir_path):
return osp.samestat(os.stat(dir_path), self.dir_stat)

Expand All @@ -143,27 +159,14 @@ def get(self, path):
'train_ids': np.array(d['train_ids'], dtype=np.uint64),
'control_sources': frozenset(d['control_sources']),
'instrument_sources': frozenset(d['instrument_sources']),
'legacy_sources': dict(d['legacy_sources']),
}
# Older cache files don't contain info on legacy sources.
if 'legacy_sources' in d:
res['legacy_sources'] = d['legacy_sources']
# Older cache files don't contain info on 'suspect' trains.
if 'suspect_train_indices' in d:
res['flag'] = flag = np.ones_like(d['train_ids'], dtype=np.bool_)
flag[d['suspect_train_indices']] = 0
res['flag'] = flag = np.ones_like(d['train_ids'], dtype=np.bool_)
flag[d['suspect_train_indices']] = 0
return res

return None

def _cache_valid(self, fname):
# The cache is invalid (needs to be written out) if the file is not in
# files_data (which it won't be if the size or mtime don't match - see
# load()), or if the later added suspect_train_indices/legagy_sources
# are missing. These may be missing from caches created by legacy
# versions of EXtra-data.
return not bool({'suspect_train_indices', 'legacy_sources'} \
- self.files_data.get(fname, {}).keys())

def save(self, files):
"""Save the cache if needed

Expand All @@ -175,7 +178,7 @@ def save(self, files):

for file_access in files:
dirname, fname = osp.split(osp.abspath(file_access.filename))
if self.is_my_directory(dirname) and not self._cache_valid(fname):
if self.is_my_directory(dirname) and fname not in self.files_data:
log.debug("Will save cached data for %s", fname)
need_save = True

Expand Down
3 changes: 2 additions & 1 deletion extra_data/tests/test_file_access.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,8 @@ def test_pickle(pickle_mod, mock_sa3_control_data):
train_ids= np.zeros(0, dtype=np.uint64),
control_sources=frozenset(),
instrument_sources=frozenset(),
flag=np.zeros(0, dtype=np.int32)
flag=np.zeros(0, dtype=np.int32),
legacy_sources={},
)


Expand Down
Loading