From 062ad7ffa86288b2a19812764dcf02ac805b48d1 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Tue, 8 Jun 2021 15:36:10 +0200
Subject: [PATCH 01/11] Ignore data chunks for empty datasets

---
 extra_data/keydata.py            |  8 +++++++
 extra_data/tests/test_keydata.py | 37 ++++++++++++++++++++++++++++++--
 2 files changed, 43 insertions(+), 2 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 2846a852..4cc03a40 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -28,6 +28,12 @@ def __init__(
     def _find_chunks(self):
         """Find contiguous chunks of data for this key, in any order."""
         for file in self.files:
+            if file.file[self.hdf5_data_path].size == 0:
+                # this file does not contain data for this key we skip the files here
+                # for cases where index claims there's data for this key, but the
+                # dataset is empty.
+                continue
+
             firsts, counts = file.get_index(self.source, self._key_group)
 
             # Of trains in this file, which are in selection
@@ -168,6 +174,8 @@ def _trainid_index(self):
             np.repeat(chunk.train_ids, chunk.counts.astype(np.intp))
             for chunk in self._data_chunks
         ]
+        if not chunks_trainids:
+            return chunks_trainids
         return np.concatenate(chunks_trainids)
 
     def xarray(self, extra_dims=None, roi=(), name=None):
diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index 637a1f83..119b62d6 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -1,8 +1,24 @@
+import os.path as osp
+
+from h5py import File
 import numpy as np
 import pytest
 
-from extra_data import RunDirectory
+from extra_data import H5File, RunDirectory
 from extra_data.exceptions import TrainIDError
+from tempfile import TemporaryDirectory
+
+from . import make_examples
+
+
+@pytest.fixture(scope='function')
+def data_aggregator_file():
+    with TemporaryDirectory() as td:
+        path = osp.join(td, 'RAW-R0450-DA01-S00001.h5')
+        make_examples.make_fxe_da_file(path)
+
+        yield path
+
 
 def test_get_keydata(mock_spb_raw_run):
     run = RunDirectory(mock_spb_raw_run)
@@ -95,7 +111,7 @@ def test_data_counts(mock_reduced_spb_proc_run):
     assert count.index.tolist() == xgm_beam_x.train_ids
     assert (count.values == 1).all()
 
-    # intrument data
+    # instrument data
     camera = run['SPB_IRU_CAM/CAM/SIDEMIC:daqOutput', 'data.image.pixels']
     count = camera.data_counts()
     assert count.index.tolist() == camera.train_ids
@@ -113,3 +129,20 @@ def test_select_by(mock_spb_raw_run):
     subrun = run.select(am0)
     assert subrun.all_sources == {am0.source}
     assert subrun.keys_for_source(am0.source) == {am0.key}
+
+
+def test_empty_dataset(data_aggregator_file):
+    with File(data_aggregator_file, 'a') as f:
+        shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].shape
+        f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize((0, *shape[1:]))
+
+        shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].shape
+        f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize((0, *shape[1:]))
+
+    run = H5File(data_aggregator_file)
+    kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']
+    assert kd.ndarray().size == 0
+    assert kd.xarray().size == 0
+
+    kd = run['SA1_XTD2_XGM/DOOCS/MAIN', 'pulseEnergy.photonFlux.value']
+    assert kd.series().size == 0
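Usage sketch (illustration, not part of the patch series): the situation
patch 01 guards against can be reproduced by truncating a dataset in a
writable copy of a data file while leaving its INDEX intact. The file name
and source/key names follow the test above; 'path' is hypothetical.

    import h5py
    from extra_data import H5File

    path = 'RAW-R0450-DA01-S00001.h5'  # hypothetical writable copy
    dset = 'INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'

    with h5py.File(path, 'a') as f:
        # Empty the dataset; the INDEX still claims data for these trains.
        shape = f[dset].shape
        f[dset].resize((0, *shape[1:]))

    run = H5File(path)
    kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']
    print(kd.ndarray().shape)  # first axis is 0: empty result, no exception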
From 1b408945a2f9d484dfead1f2976a4cfd6ce6c69d Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Tue, 8 Jun 2021 18:36:47 +0200
Subject: [PATCH 02/11] skip empty dataset

---
 extra_data/keydata.py                    | 12 ++++++---
 extra_data/reader.py                     | 11 +++++---
 extra_data/tests/conftest.py             | 16 +++++++++++
 extra_data/tests/test_keydata.py         | 34 ++++++++----------------
 extra_data/tests/test_reader_mockdata.py | 21 +++++++++++++++
 5 files changed, 64 insertions(+), 30 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 4cc03a40..59fa04e4 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import List, Optional, Tuple
 
 import numpy as np
 
@@ -287,6 +287,8 @@ def dask_array(self, labelled=False):
                 )[chunk.slice]
             )
 
+        if not chunks_darrs:
+            chunks_darrs = [da.empty(shape=self.shape, dtype=self.dtype, chunks=self.shape)]
         dask_arr = da.concatenate(chunks_darrs, axis=0)
 
         if labelled:
@@ -303,7 +305,7 @@ def dask_array(self, labelled=False):
 
     # Getting data by train: --------------------------------------------------
 
-    def _find_tid(self, tid) -> (Optional[FileAccess], int):
+    def _find_tid(self, tid) -> Tuple[Optional[FileAccess], int]:
         for fa in self.files:
             matches = (fa.train_ids == tid).nonzero()[0]
             if self.inc_suspect_trains and matches.size > 0:
@@ -324,8 +326,8 @@ def train_from_id(self, tid):
             raise TrainIDError(tid)
 
         fa, ix = self._find_tid(tid)
-        if fa is None:
-            return np.empty((0,) + self.entry_shape, dtype=self.dtype)
+        if fa is None or fa.file[self.hdf5_data_path].size == 0:
+            return tid, np.empty((0,) + self.entry_shape, dtype=self.dtype)
 
         firsts, counts = fa.get_index(self.source, self._key_group)
         first, count = firsts[ix], counts[ix]
@@ -349,6 +351,8 @@ def trains(self):
         for chunk in self._data_chunks:
             start = chunk.first
             ds = chunk.dataset
+            if ds.size == 0:
+                continue
             for tid, count in zip(chunk.train_ids, chunk.counts):
                 if count > 1:
                     yield tid, ds[start: start+count]
diff --git a/extra_data/reader.py b/extra_data/reader.py
index 4907cacb..3f38d734 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -28,6 +28,7 @@
 import sys
 import tempfile
 import time
+from typing import Tuple
 from warnings import warn
 
 from .exceptions import (
@@ -381,6 +382,8 @@ def train_from_id(self, train_id, devices=None, *, flat_keys=False):
 
             for key in self.keys_for_source(source):
                 path = '/CONTROL/{}/{}'.format(source, key.replace('.', '/'))
+                if file.file[path].size == 0:
+                    continue
                 source_data[key] = file.file[path][pos]
 
         for source in self.instrument_sources:
@@ -399,6 +402,8 @@ def train_from_id(self, train_id, devices=None, *, flat_keys=False):
                     continue
 
                 path = '/INSTRUMENT/{}/{}'.format(source, key.replace('.', '/'))
+                if file.file[path].size == 0:
+                    continue
                 if count == 1:
                     source_data[key] = file.file[path][first]
                 else:
@@ -997,7 +1002,7 @@ def _find_data_chunks(self, source, key):
         """
         return self._get_key_data(source, key)._data_chunks
 
-    def _find_data(self, source, train_id) -> (FileAccess, int):
+    def _find_data(self, source, train_id) -> Tuple[FileAccess, int]:
         for f in self._source_index[source]:
             ixs = (f.train_ids == train_id).nonzero()[0]
             if self.inc_suspect_trains and ixs.size > 0:
@@ -1367,7 +1372,7 @@ def _assemble_data(self, tid):
 
             for key in self.data.keys_for_source(source):
                 _, pos, ds = self._find_data(source, key, tid)
-                if ds is None:
+                if ds is None or ds.size == 0:
                     continue
                 self._set_result(res, source, key, ds[pos])
 
@@ -1376,7 +1381,7 @@ def _assemble_data(self, tid):
                 {'source': source, 'timestamp.tid': tid})
             for key in self.data.keys_for_source(source):
                 file, pos, ds = self._find_data(source, key, tid)
-                if ds is None:
+                if ds is None or ds.size == 0:
                     continue
                 group = key.partition('.')[0]
                 firsts, counts = file.get_index(source, group)
diff --git a/extra_data/tests/conftest.py b/extra_data/tests/conftest.py
index e9f53d6b..81f52762 100644
--- a/extra_data/tests/conftest.py
+++ b/extra_data/tests/conftest.py
@@ -122,3 +122,19 @@ def mock_empty_file():
         path = osp.join(td, 'RAW-R0450-DA01-S00002.h5')
         make_examples.make_sa3_da_file(path, ntrains=0)
         yield path
+
+
+@pytest.fixture(scope='function')
+def mock_empty_dataset_file(format_version):
+    with TemporaryDirectory() as td:
+        path = osp.join(td, 'RAW-R0999-DA10-S00001.h5')
+        make_examples.make_fxe_da_file(path, format_version=format_version)
+
+        with h5py.File(path, 'a') as f:
+            shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].shape
+            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize((0, *shape[1:]))
+
+            shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].shape
+            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize((0, *shape[1:]))
+
+        yield path
diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index 119b62d6..75464496 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -1,23 +1,8 @@
-import os.path as osp
-
-from h5py import File
 import numpy as np
 import pytest
 
 from extra_data import H5File, RunDirectory
 from extra_data.exceptions import TrainIDError
-from tempfile import TemporaryDirectory
-
-from . import make_examples
-
-
-@pytest.fixture(scope='function')
-def data_aggregator_file():
-    with TemporaryDirectory() as td:
-        path = osp.join(td, 'RAW-R0450-DA01-S00001.h5')
-        make_examples.make_fxe_da_file(path)
-
-        yield path
-
 
 def test_get_keydata(mock_spb_raw_run):
     run = RunDirectory(mock_spb_raw_run)
@@ -131,18 +116,21 @@ def test_select_by(mock_spb_raw_run):
     assert subrun.keys_for_source(am0.source) == {am0.key}
 
 
-def test_empty_dataset(data_aggregator_file):
-    with File(data_aggregator_file, 'a') as f:
-        shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].shape
-        f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize((0, *shape[1:]))
-
-        shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].shape
-        f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize((0, *shape[1:]))
+def test_empty_dataset(mock_empty_dataset_file):
 
-    run = H5File(data_aggregator_file)
+    run = H5File(mock_empty_dataset_file)
     kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']
     assert kd.ndarray().size == 0
     assert kd.xarray().size == 0
+    assert kd.dask_array().size == 0
+
+    tid, data = kd.train_from_index(0)
+    assert data.shape == (0, 1000)
 
     kd = run['SA1_XTD2_XGM/DOOCS/MAIN', 'pulseEnergy.photonFlux.value']
     assert kd.series().size == 0
+    tid, data = kd.train_from_index(0)
+    assert tid == 10000
+    assert data.shape == (0,)
+
+    assert len(list(kd.trains())) == 0
diff --git a/extra_data/tests/test_reader_mockdata.py b/extra_data/tests/test_reader_mockdata.py
index 8ba38447..efe23615 100644
--- a/extra_data/tests/test_reader_mockdata.py
+++ b/extra_data/tests/test_reader_mockdata.py
@@ -804,3 +804,24 @@ def test_run_metadata(mock_spb_raw_run):
         'sample', 'sequenceNumber',
     }
     assert isinstance(md['creationDate'], str)
+
+
+def test_empty_dataset(mock_empty_dataset_file):
+
+    run = H5File(mock_empty_dataset_file)
+    device, key = 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD'
+
+    assert run.get_array(device, key).size == 0
+    assert run.get_dask_array(device, key).size == 0
+
+    sel = run.select(device, key)
+    _, data = sel.train_from_index(0)
+    assert list(data[device].keys()) == ['metadata']
+
+    # assert len(list(sel.trains(require_all=True))) == 0
+    for _, data in sel.trains(require_all=True):
+        assert key not in data[device]
+        break
+
+    _, data = sel.train_from_index(0)
+    assert key not in data[device]
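Sketch of the train-level behaviour patch 02 establishes, assuming 'path'
prepared as in the previous sketch:

    from extra_data import H5File

    run = H5File(path)
    kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']

    # Per-train access returns (train_id, empty array) instead of raising,
    # trains() skips chunks backed by empty datasets, and dask_array()
    # falls back to a synthesised empty array.
    tid, data = kd.train_from_index(0)
    assert data.shape[0] == 0
    assert len(list(kd.trains())) == 0
    assert kd.dask_array().size == 0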
From 1a1868ca31bac092cbf62db936982099fad737ac Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Tue, 8 Jun 2021 19:10:27 +0200
Subject: [PATCH 03/11] handle pd.Series for instrument_keys in DataCollection

---
 extra_data/reader.py                     | 7 ++++++-
 extra_data/tests/conftest.py             | 6 ++++++
 extra_data/tests/test_reader_mockdata.py | 9 ++++++++-
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/extra_data/reader.py b/extra_data/reader.py
index 3f38d734..a888e05c 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -486,6 +486,8 @@ def get_series(self, source, key):
         if source in self.instrument_sources:
             data_path = "/INSTRUMENT/{}/{}".format(source, key.replace('.', '/'))
             for f in self._source_index[source]:
+                if f.file[data_path].size == 0:
+                    continue
                 group = key.partition('.')[0]
                 firsts, counts = f.get_index(source, group)
                 trainids = self._expand_trainids(counts, f.train_ids)
@@ -515,7 +517,10 @@ def get_series(self, source, key):
         else:
             return self._get_key_data(source, key).series()
 
-        ser = pd.concat(sorted(seq_series, key=lambda s: s.index[0]))
+        if not seq_series:
+            ser = pd.Series([])
+        else:
+            ser = pd.concat(sorted(seq_series, key=lambda s: s.index[0]))
 
         # Select out only the train IDs of interest
         if isinstance(ser.index, pd.MultiIndex):
diff --git a/extra_data/tests/conftest.py b/extra_data/tests/conftest.py
index 81f52762..60ee9532 100644
--- a/extra_data/tests/conftest.py
+++ b/extra_data/tests/conftest.py
@@ -134,7 +134,13 @@ def mock_empty_dataset_file(format_version):
             shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].shape
             f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize((0, *shape[1:]))
 
+            shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/trainId'].shape
+            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/trainId'].resize((0, *shape[1:]))
+
             shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].shape
             f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize((0, *shape[1:]))
 
+            shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/beamPosition/ixPos/value'].shape
+            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/beamPosition/ixPos/value'].resize((0, *shape[1:]))
+
         yield path
diff --git a/extra_data/tests/test_reader_mockdata.py b/extra_data/tests/test_reader_mockdata.py
index efe23615..a1f4c13c 100644
--- a/extra_data/tests/test_reader_mockdata.py
+++ b/extra_data/tests/test_reader_mockdata.py
@@ -818,10 +818,17 @@ def test_empty_dataset(mock_empty_dataset_file):
     _, data = sel.train_from_index(0)
     assert list(data[device].keys()) == ['metadata']
 
-    # assert len(list(sel.trains(require_all=True))) == 0
     for _, data in sel.trains(require_all=True):
         assert key not in data[device]
         break
 
     _, data = sel.train_from_index(0)
     assert key not in data[device]
+
+    s = run.get_series(device, 'data.trainId')
+    assert isinstance(s, pd.Series)
+    assert len(s) == 0
+
+    df = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")])
+    assert len(df.columns) == 4
+    assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df.columns
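The get_series() fallback above matters because pd.concat([]) raises
ValueError("No objects to concatenate") once every sequence file is
skipped. A minimal sketch, with 'run' prepared as in the sketches above:

    import pandas as pd

    s = run.get_series('SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.trainId')
    assert isinstance(s, pd.Series)
    assert len(s) == 0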
From 2209e747dfb4908fbdf0045651d5202c1c05a934 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 09:28:26 +0200
Subject: [PATCH 04/11] docstring

---
 extra_data/keydata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 59fa04e4..b6c59952 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -29,7 +29,7 @@ def _find_chunks(self):
         """Find contiguous chunks of data for this key, in any order."""
         for file in self.files:
             if file.file[self.hdf5_data_path].size == 0:
-                # this file does not contain data for this key we skip the files here
+                # this file does not contain data for this key. We skip the files here
                 # for cases where index claims there's data for this key, but the
                 # dataset is empty.
                 continue
From 48684b50863d335191aae88be3485c7680992266 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 13:20:32 +0200
Subject: [PATCH 05/11] add missing data types

---
 extra_data/keydata.py        |  5 +++--
 extra_data/reader.py         |  2 +-
 extra_data/tests/conftest.py | 15 ++++-----------
 3 files changed, 8 insertions(+), 14 deletions(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index b6c59952..8648e9d7 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -175,7 +175,7 @@ def _trainid_index(self):
             for chunk in self._data_chunks
         ]
         if not chunks_trainids:
-            return chunks_trainids
+            return np.array([], dtype=np.uint64)
         return np.concatenate(chunks_trainids)
 
     def xarray(self, extra_dims=None, roi=(), name=None):
@@ -288,7 +288,8 @@ def dask_array(self, labelled=False):
             )
 
         if not chunks_darrs:
-            chunks_darrs = [da.empty(shape=self.shape, dtype=self.dtype, chunks=self.shape)]
+            chunks_darrs = [da.empty(shape=(0,) + self.entry_shape,
+                                     dtype=self.dtype, chunks=self.shape)]
         dask_arr = da.concatenate(chunks_darrs, axis=0)
 
         if labelled:
diff --git a/extra_data/reader.py b/extra_data/reader.py
index a888e05c..d5d040b5 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -518,7 +518,7 @@ def get_series(self, source, key):
             return self._get_key_data(source, key).series()
 
         if not seq_series:
-            ser = pd.Series([])
+            ser = pd.Series([], dtype=self._get_key_data(source, key).dtype)
         else:
             ser = pd.concat(sorted(seq_series, key=lambda s: s.index[0]))
 
diff --git a/extra_data/tests/conftest.py b/extra_data/tests/conftest.py
index 60ee9532..901f902c 100644
--- a/extra_data/tests/conftest.py
+++ b/extra_data/tests/conftest.py
@@ -131,16 +131,9 @@ def mock_empty_dataset_file(format_version):
         make_examples.make_fxe_da_file(path, format_version=format_version)
 
         with h5py.File(path, 'a') as f:
-            shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].shape
-            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize((0, *shape[1:]))
-
-            shape = f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/trainId'].shape
-            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/trainId'].resize((0, *shape[1:]))
-
-            shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].shape
-            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize((0, *shape[1:]))
-
-            shape = f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/beamPosition/ixPos/value'].shape
-            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/beamPosition/ixPos/value'].resize((0, *shape[1:]))
+            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/intensityTD'].resize(0, axis=0)
+            f['INSTRUMENT/SA1_XTD2_XGM/DOOCS/MAIN:output/data/trainId'].resize(0, axis=0)
+            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/pulseEnergy/photonFlux/value'].resize(0, axis=0)
+            f['CONTROL/SA1_XTD2_XGM/DOOCS/MAIN/beamPosition/ixPos/value'].resize(0, axis=0)
 
         yield path
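With patch 05 applied, empty results keep the key's real dtype and entry
shape instead of the library defaults (pd.Series([]) is float64, and the
earlier dask fallback used self.shape). A sketch, names as in the sketches
above:

    s = run.get_series('SA1_XTD2_XGM/DOOCS/MAIN', 'pulseEnergy.photonFlux.value')
    print(s.dtype)       # the key's dtype, not the float64 default

    darr = kd.dask_array()
    print(darr.shape)    # (0,) + kd.entry_shape, e.g. (0, 1000) for intensityTD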
From 122e6a97d82957b80a6f14f6e61587b1ea836d22 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 14:03:47 +0200
Subject: [PATCH 06/11] fix data counts when datasets are empty

---
 extra_data/keydata.py                    | 4 +++-
 extra_data/tests/test_keydata.py         | 1 +
 extra_data/tests/test_reader_mockdata.py | 1 +
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/extra_data/keydata.py b/extra_data/keydata.py
index 8648e9d7..5231acb9 100644
--- a/extra_data/keydata.py
+++ b/extra_data/keydata.py
@@ -122,7 +122,9 @@ def data_counts(self):
 
         seq_series = []
         for f in self.files:
-            if self.section == 'CONTROL':
+            if f.file[self.hdf5_data_path].size == 0:
+                counts = np.zeros_like(f.train_ids, dtype=np.uint64)
+            elif self.section == 'CONTROL':
                 counts = np.ones(len(f.train_ids), dtype=np.uint64)
             else:
                 _, counts = f.get_index(self.source, self._key_group)
diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index 75464496..ee8e1f05 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -120,6 +120,7 @@ def test_empty_dataset(mock_empty_dataset_file):
 
     run = H5File(mock_empty_dataset_file)
     kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']
+    assert not kd.data_count().values.any()
     assert kd.ndarray().size == 0
     assert kd.xarray().size == 0
     assert kd.dask_array().size == 0
diff --git a/extra_data/tests/test_reader_mockdata.py b/extra_data/tests/test_reader_mockdata.py
index a1f4c13c..809959e5 100644
--- a/extra_data/tests/test_reader_mockdata.py
+++ b/extra_data/tests/test_reader_mockdata.py
@@ -811,6 +811,7 @@ def test_empty_dataset(mock_empty_dataset_file):
     run = H5File(mock_empty_dataset_file)
     device, key = 'SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD'
 
+    assert not run.get_data_counts(device, key).any()
     assert run.get_array(device, key).size == 0
     assert run.get_dask_array(device, key).size == 0
 

From a3b1a018cecc4415171eb93c949707807fafb6b3 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 14:23:57 +0200
Subject: [PATCH 07/11] typo

---
 extra_data/tests/test_keydata.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/extra_data/tests/test_keydata.py b/extra_data/tests/test_keydata.py
index ee8e1f05..91130501 100644
--- a/extra_data/tests/test_keydata.py
+++ b/extra_data/tests/test_keydata.py
@@ -120,7 +120,7 @@ def test_empty_dataset(mock_empty_dataset_file):
 
     run = H5File(mock_empty_dataset_file)
     kd = run['SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD']
-    assert not kd.data_count().values.any()
+    assert not kd.data_counts().values.any()
     assert kd.ndarray().size == 0
     assert kd.xarray().size == 0
     assert kd.dask_array().size == 0

From 24a16b1353424d56615da7f73e6fda0a08d3789f Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 15:36:53 +0200
Subject: [PATCH 08/11] check for empty sources in .select

---
 extra_data/reader.py | 30 ++++++------------------------
 1 file changed, 6 insertions(+), 24 deletions(-)

diff --git a/extra_data/reader.py b/extra_data/reader.py
index d5d040b5..9c12b406 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -850,34 +850,16 @@ def select(self, seln_or_source_glob, key_glob='*', require_all=False):
             train_ids = self.train_ids
 
             for source, keys in selection.items():
-                if source in self.instrument_sources:
-                    # For INSTRUMENT sources, the INDEX is saved by
-                    # key group, which is the first hash component. In
-                    # many cases this is 'data', but not always.
-                    if keys is None:
-                        # All keys are selected.
-                        keys = self.keys_for_source(source)
-
-                    groups = {key.partition('.')[0] for key in keys}
-                else:
-                    # CONTROL data has no key group.
-                    groups = ['']
-
-                for group in groups:
-                    # Empty list would be converted to np.float64 array.
-                    source_tids = np.empty(0, dtype=np.uint64)
+                if keys is None:
+                    keys = self.keys_for_source(source)
 
-                    for f in self._source_index[source]:
-                        valid = True if self.inc_suspect_trains else f.validity_flag
-                        # Add the trains with data in each file.
-                        _, counts = f.get_index(source, group)
-                        source_tids = np.union1d(
-                            f.train_ids[valid & (counts > 0)], source_tids
-                        )
+                for key in keys:
+                    counts = self._get_key_data(source, key).data_counts()
+                    key_tids = counts[counts>0].index.values
 
                     # Remove any trains previously selected, for which this
                     # selected source and key group has no data.
-                    train_ids = np.intersect1d(train_ids, source_tids)
+                    train_ids = np.intersect1d(train_ids, key_tids)
 
             # Filtering may have eliminated previously selected files.
             files = [f for f in files
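Sketch of what the reworked select() computes, assuming 'run' prepared as
in the sketches above: trains are now filtered per key via
KeyData.data_counts(), so a key whose dataset is empty removes every train
from a require_all selection.

    sel = run.select('SA1_XTD2_XGM/DOOCS/MAIN:output', 'data.intensityTD',
                     require_all=True)
    print(sel.train_ids)  # [] - no train has data for the selected key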
From 78e557e5a95b0698bedb235e99d7eaefc7cb8e1f Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 15:58:02 +0200
Subject: [PATCH 09/11] return if empty selection

---
 extra_data/reader.py | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/extra_data/reader.py b/extra_data/reader.py
index 9c12b406..15cec954 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -861,6 +861,13 @@ def select(self, seln_or_source_glob, key_glob='*', require_all=False):
                     # selected source and key group has no data.
                     train_ids = np.intersect1d(train_ids, key_tids)
 
+            if train_ids.size == 0:
+                return DataCollection(
+                    [], selection={}, train_ids=[],
+                    inc_suspect_trains=self.inc_suspect_trains,
+                    is_single_run=self.is_single_run
+                )
+
             # Filtering may have eliminated previously selected files.
             files = [f for f in files

From c28b72bd8e41e98339782ff9cd517679e6693c21 Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 16:36:33 +0200
Subject: [PATCH 10/11] test selection

---
 extra_data/reader.py                     | 2 +-
 extra_data/tests/test_reader_mockdata.py | 4 ++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/extra_data/reader.py b/extra_data/reader.py
index 15cec954..1f2ee718 100644
--- a/extra_data/reader.py
+++ b/extra_data/reader.py
@@ -863,7 +863,7 @@ def select(self, seln_or_source_glob, key_glob='*', require_all=False):
 
             if train_ids.size == 0:
                 return DataCollection(
-                    [], selection={}, train_ids=[],
+                    [], selection=selection, train_ids=[],
                     inc_suspect_trains=self.inc_suspect_trains,
                     is_single_run=self.is_single_run
                 )
diff --git a/extra_data/tests/test_reader_mockdata.py b/extra_data/tests/test_reader_mockdata.py
index 809959e5..e114d02a 100644
--- a/extra_data/tests/test_reader_mockdata.py
+++ b/extra_data/tests/test_reader_mockdata.py
@@ -833,3 +833,7 @@ def test_empty_dataset(mock_empty_dataset_file):
     df = run.get_dataframe(fields=[("*_XGM/*", "*.i[xy]Pos*")])
     assert len(df.columns) == 4
     assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df.columns
+
+    dc = run.select(device, require_all=True)
+    assert dc.all_sources == frozenset([device])
+    assert dc.train_ids == []

From 19f1efbf4feb4ee5903a93d584def42f1babb9de Mon Sep 17 00:00:00 2001
From: Thomas Michelat
Date: Wed, 9 Jun 2021 16:38:40 +0200
Subject: [PATCH 11/11] test selection

---
 extra_data/tests/test_reader_mockdata.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/extra_data/tests/test_reader_mockdata.py b/extra_data/tests/test_reader_mockdata.py
index e114d02a..cb80ebfa 100644
--- a/extra_data/tests/test_reader_mockdata.py
+++ b/extra_data/tests/test_reader_mockdata.py
@@ -835,5 +835,6 @@ def test_empty_dataset(mock_empty_dataset_file):
     assert "SA1_XTD2_XGM/DOOCS/MAIN/beamPosition.ixPos" in df.columns
 
     dc = run.select(device, require_all=True)
-    assert dc.all_sources == frozenset([device])
+    assert dc.selection == {device: None}
+    assert dc.all_sources == frozenset()
     assert dc.train_ids == []
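Net effect of the last three patches, as a closing sketch ('run' prepared
as in the sketches above): an empty require_all selection now returns a
valid, empty DataCollection that still records what was selected.

    device = 'SA1_XTD2_XGM/DOOCS/MAIN:output'
    dc = run.select(device, require_all=True)
    assert dc.selection == {device: None}
    assert dc.all_sources == frozenset()
    assert dc.train_ids == []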