Modified reading of hdf5 files to (1) account for parsing out multiple conformers, (2) increase efficiency, and (3) exclude entries where one of the properties contains NaN. Regarding speed specifically: it is very slow to index into an array within the HDF5 file; it is much faster to copy the data into a local array and then index into that.
chrisiacovella committed Sep 15, 2023
1 parent e9f33d4 commit 914d339
Showing 2 changed files with 47 additions and 18 deletions.
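
The speed point in the commit message — that indexing element-by-element into an h5py dataset is far slower than copying it into a local NumPy array first — can be illustrated with a small sketch. The file name, group layout, and sizes below are invented for the example; only the dset[()] copy-then-index pattern reflects what the updated _from_hdf5 in the diff does.

import time

import h5py
import numpy as np

# Hypothetical file and dataset names, purely for illustration.
with h5py.File("example.hdf5", "w") as hf:
    hf.create_dataset("mol1/energies", data=np.random.rand(20_000, 1))

with h5py.File("example.hdf5", "r") as hf:
    dset = hf["mol1/energies"]

    # Slow: each iteration is a separate read through the HDF5 layer.
    start = time.perf_counter()
    slow = [dset[i] for i in range(dset.shape[0])]
    print(f"indexing into the hdf5 dataset: {time.perf_counter() - start:.3f} s")

    # Fast: copy once into a local ndarray, then index in memory,
    # i.e. the temp_data[value] = hf[mol][value][()] pattern in the diff below.
    start = time.perf_counter()
    local = dset[()]
    fast = [local[i] for i in range(local.shape[0])]
    print(f"copying to a local array first: {time.perf_counter() - start:.3f} s")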
61 changes: 44 additions & 17 deletions modelforge/dataset/dataset.py
@@ -97,11 +97,14 @@ class HDF5Dataset:
Path to the processed data file.
"""

def __init__(self, raw_data_file: str, processed_data_file: str):
def __init__(
self, raw_data_file: str, processed_data_file: str, local_cache_dir: str
):
self.raw_data_file = raw_data_file
self.processed_data_file = processed_data_file
self.hdf5data: Optional[Dict[str, List]] = None
self.numpy_data: Optional[np.ndarray] = None
self.local_cache_dir = local_cache_dir

def _from_hdf5(self) -> None:
"""
@@ -131,27 +134,51 @@ def _from_hdf5(self) -> None:
for value in self.properties_of_interest:
data[value] = []

data["name"] = []
logger.debug(f"Processing and extracting data from {self.raw_data_file}")

# this will create an unzipped file which we can then load in
# this is substantially faster than passing gz_file directly to h5py.File()
# and should not run afoul with any chunking of the data.
temp_hdf5_file = f"{self.local_cache_dir}/temp_unzipped.hdf5"
with gzip.open(self.raw_data_file, "rb") as gz_file:
with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file:
with open(temp_hdf5_file, "wb") as out_file:
shutil.copyfileobj(gz_file, out_file)
with h5py.File(self.raw_data_file.replace(".gz", ""), "r") as hf:
logger.debug(f"n_entries: {len(hf.keys())}")
for mol in tqdm.tqdm(list(hf.keys())):
n_configs = hf[mol]["n_configs"][()]
for i in range(n_configs):
temp_data = {}
contains_nan = False
for value in self.properties_of_interest:
# if we have a series, we will index into it
if hf[mol][value].attrs["series"]:
temp_data[value] = hf[mol][value][i]
if
else: # if we do not have a series, just append the value
temp_data[value] = hf[mol][value][()]
data[value].append(hf[mol][value][()])

with h5py.File(temp_hdf5_file, "r") as hf:
logger.debug(f"n_entries: {len(hf.keys())}")
for mol in tqdm.tqdm(list(hf.keys())):
n_configs = hf[mol]["n_configs"][()]
temp_data = {}
is_series = {}

for value in self.properties_of_interest:
# First grab all the data of interest;
# indexing into a local np array is much faster
# than indexing into the array in the hdf5 file
temp_data[value] = hf[mol][value][()]
is_series[value] = hf[mol][value].attrs["series"]

for n in range(n_configs):
not_nan = True
temp_data_cut = {}
for value in self.properties_of_interest:
if is_series[value]:
temp_data_cut[value] = temp_data[value][n]
if np.any(np.isnan(temp_data_cut[value])):
not_nan = False
break
else:
temp_data_cut[value] = temp_data[value]
if np.any(np.isnan(temp_data_cut[value])):
not_nan = False
break
if not_nan:
for value in self.properties_of_interest:
data[value].append(temp_data_cut[value])
# keep track of the name of the molecule and configuration number
# may be needed for splitting
data["name"].append(f"{mol}_{n}")

self.hdf5data = data

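The comment added in the hunk above — decompress the .hdf5.gz archive to a temporary file instead of handing the compressed stream to h5py.File() — is a general pattern. A minimal standalone sketch, assuming a hypothetical helper name and placeholder paths:

import gzip
import shutil

import h5py

def open_gzipped_hdf5(gz_path: str, cache_dir: str = ".") -> h5py.File:
    # Hypothetical helper, not part of modelforge: decompress once to a
    # temporary file in cache_dir, then open the plain .hdf5 with h5py.
    # h5py can read from a file-like object, but seeking inside a GzipFile
    # is expensive, so a one-time decompression is typically much faster.
    temp_path = f"{cache_dir}/temp_unzipped.hdf5"
    with gzip.open(gz_path, "rb") as gz_file:
        with open(temp_path, "wb") as out_file:
            shutil.copyfileobj(gz_file, out_file)
    return h5py.File(temp_path, "r")

# Usage (placeholder path):
# hf = open_gzipped_hdf5("QM9_cache.hdf5.gz", cache_dir=".")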
4 changes: 3 additions & 1 deletion modelforge/dataset/qm9.py
@@ -18,7 +18,8 @@ class QM9Dataset(HDF5Dataset):
Name of the dataset, default is "QM9".
for_unit_testing : bool
If set to True, a subset of the dataset is used for unit testing purposes; by default False.
local_cache_dir: str, optional
Path to the local cache directory, by default ".".
Examples
--------
>>> data = QM9Dataset()
@@ -77,6 +78,7 @@ def __init__(
super().__init__(
f"{local_cache_dir}/{dataset_name}_cache.hdf5.gz",
f"{local_cache_dir}/{dataset_name}_processed.npz",
local_cache_dir=local_cache_dir,
)
self.dataset_name = dataset_name
self.for_unit_testing = for_unit_testing
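A minimal usage sketch of the new keyword, assuming the import path follows the file layout above and using an arbitrary cache directory:

from modelforge.dataset.qm9 import QM9Dataset

# local_cache_dir is where the downloaded .hdf5.gz archive, the temporary
# unzipped .hdf5 file, and the processed .npz file are written;
# "./qm9_cache" is just an example path.
data = QM9Dataset(local_cache_dir="./qm9_cache")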
