Modified reading of hdf5 files to (1) account for parsing out multiple conformers, (2) increase efficiency, and (3) exclude entries where one of the properties contains NaN. Regarding speed specifically: it is very slow to index into an array within the HDF5 file; it is much faster to copy the data into a local array and then index into that.
chrisiacovella committed Sep 15, 2023
1 parent e9f33d4 commit 914d339
Showing 2 changed files with 47 additions and 18 deletions.
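
The speed point in the commit message — that indexing element-by-element into an h5py dataset is far slower than copying it into a local NumPy array first — can be illustrated with a small sketch. The file name, group layout, and sizes below are invented for the example; only the dset[()] copy-then-index pattern reflects what the updated _from_hdf5 in the diff does.

import time

import h5py
import numpy as np

# Hypothetical file and dataset names, purely for illustration.
with h5py.File("example.hdf5", "w") as hf:
    hf.create_dataset("mol1/energies", data=np.random.rand(20_000, 1))

with h5py.File("example.hdf5", "r") as hf:
    dset = hf["mol1/energies"]

    # Slow: each iteration is a separate read through the HDF5 layer.
    start = time.perf_counter()
    slow = [dset[i] for i in range(dset.shape[0])]
    print(f"indexing into the hdf5 dataset: {time.perf_counter() - start:.3f} s")

    # Fast: copy once into a local ndarray, then index in memory,
    # i.e. the temp_data[value] = hf[mol][value][()] pattern in the diff below.
    start = time.perf_counter()
    local = dset[()]
    fast = [local[i] for i in range(local.shape[0])]
    print(f"copying to a local array first: {time.perf_counter() - start:.3f} s")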
61 changes: 44 additions & 17 deletions modelforge/dataset/dataset.py
@@ -97,11 +97,14 @@ class HDF5Dataset:
Path to the processed data file.
"""

def __init__(self, raw_data_file: str, processed_data_file: str):
def __init__(
self, raw_data_file: str, processed_data_file: str, local_cache_dir: str
):
self.raw_data_file = raw_data_file
self.processed_data_file = processed_data_file
self.hdf5data: Optional[Dict[str, List]] = None
self.numpy_data: Optional[np.ndarray] = None
self.local_cache_dir = local_cache_dir

def _from_hdf5(self) -> None:
"""
@@ -131,27 +134,51 @@ def _from_hdf5(self) -> None:
for value in self.properties_of_interest:
data[value] = []

data["name"] = []
logger.debug(f"Processing and extracting data from {self.raw_data_file}")

# this will create an unzipped file which we can then load in
# this is substantially faster than passing gz_file directly to h5py.File()
# and should not run afoul with any chunking of the data.
temp_hdf5_file = f"{self.local_cache_dir}/temp_unzipped.hdf5"
with gzip.open(self.raw_data_file, "rb") as gz_file:
with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file:
with open(temp_hdf5_file, "wb") as out_file:
shutil.copyfileobj(gz_file, out_file)
with h5py.File(self.raw_data_file.replace(".gz", ""), "r") as hf:
logger.debug(f"n_entries: {len(hf.keys())}")
for mol in tqdm.tqdm(list(hf.keys())):
n_configs = hf[mol]["n_configs"][()]
for i in range(n_configs):
temp_data = {}
contains_nan = False
for value in self.properties_of_interest:
# if we have a series, we will index into it
if hf[mol][value].attrs["series"]:
temp_data[value] = hf[mol][value][i]
if
else: # if we do not have a series, just append the value
temp_data[value] = hf[mol][value][()]
data[value].append(hf[mol][value][()])

with h5py.File(temp_hdf5_file, "r") as hf:
logger.debug(f"n_entries: {len(hf.keys())}")
for mol in tqdm.tqdm(list(hf.keys())):
n_configs = hf[mol]["n_configs"][()]
temp_data = {}
is_series = {}

for value in self.properties_of_interest:
# First grab all the data of interest;
# indexing into a local np array is much faster
# than indexing into the array in the hdf5 file
temp_data[value] = hf[mol][value][()]
is_series[value] = hf[mol][value].attrs["series"]

for n in range(n_configs):
not_nan = True
temp_data_cut = {}
for value in self.properties_of_interest:
if is_series[value]:
temp_data_cut[value] = temp_data[value][n]
if np.any(np.isnan(temp_data_cut[value])):
not_nan = False
break
else:
temp_data_cut[value] = temp_data[value]
if np.any(np.isnan(temp_data_cut[value])):
not_nan = False
break
if not_nan:
for value in self.properties_of_interest:
data[value].append(temp_data_cut[value])
# keep track of the name of the molecule and configuration number
# may be needed for splitting
data["name"].append(f"{mol}_{n}")

self.hdf5data = data

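The comment added in the hunk above — decompress the .hdf5.gz archive to a temporary file instead of handing the compressed stream to h5py.File() — is a general pattern. A minimal standalone sketch, assuming a hypothetical helper name and placeholder paths:

import gzip
import shutil

import h5py

def open_gzipped_hdf5(gz_path: str, cache_dir: str = ".") -> h5py.File:
    # Hypothetical helper, not part of modelforge: decompress once to a
    # temporary file in cache_dir, then open the plain .hdf5 with h5py.
    # h5py can read from a file-like object, but seeking inside a GzipFile
    # is expensive, so a one-time decompression is typically much faster.
    temp_path = f"{cache_dir}/temp_unzipped.hdf5"
    with gzip.open(gz_path, "rb") as gz_file:
        with open(temp_path, "wb") as out_file:
            shutil.copyfileobj(gz_file, out_file)
    return h5py.File(temp_path, "r")

# Usage (placeholder path):
# hf = open_gzipped_hdf5("QM9_cache.hdf5.gz", cache_dir=".")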
4 changes: 3 additions & 1 deletion modelforge/dataset/qm9.py
@@ -18,7 +18,8 @@ class QM9Dataset(HDF5Dataset):
Name of the dataset, default is "QM9".
for_unit_testing : bool
If set to True, a subset of the dataset is used for unit testing purposes; by default False.
local_cache_dir: str, optional
Path to the local cache directory, by default ".".
Examples
--------
>>> data = QM9Dataset()
@@ -77,6 +78,7 @@ def __init__(
super().__init__(
f"{local_cache_dir}/{dataset_name}_cache.hdf5.gz",
f"{local_cache_dir}/{dataset_name}_processed.npz",
local_cache_dir=local_cache_dir,
)
self.dataset_name = dataset_name
self.for_unit_testing = for_unit_testing
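A minimal usage sketch of the new keyword, assuming the import path follows the file layout above and using an arbitrary cache directory:

from modelforge.dataset.qm9 import QM9Dataset

# local_cache_dir is where the downloaded .hdf5.gz archive, the temporary
# unzipped .hdf5 file, and the processed .npz file are written;
# "./qm9_cache" is just an example path.
data = QM9Dataset(local_cache_dir="./qm9_cache")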
