Skip to content

Commit

Permalink
Merge pull request #245 from chrisiacovella/phalkethoh_datasets
Browse files Browse the repository at this point in the history
Updated Phalkethoh datasets
  • Loading branch information
chrisiacovella authored Aug 30, 2024
2 parents 83bd334 + bf1d04c commit 5551bea
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 95 deletions.
211 changes: 122 additions & 89 deletions modelforge/curation/phalkethoh_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _calculate_total_charge(
rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms())

return (int(total_charge) * unit.elementary_charge,)
return int(total_charge) * unit.elementary_charge

def _process_downloaded(
self,
Expand All @@ -277,6 +277,8 @@ def _process_downloaded(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
atomic_numbers_to_limit: Optional[List[int]] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only: Optional[bool] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -295,6 +297,11 @@ def _process_downloaded(
If set, this will limit the total number of conformers to the specified number.
atomic_numbers_to_limit: Optional[List[int]], optional, default=None
If set, this will limit the dataset to only include molecules with atomic numbers in the list.
max_force: Optional[unit.Quantity], optional, default=None
    If set, this will exclude any conformers with a force that exceeds this value.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed. This should be the final
energy minimized conformer.
"""
from tqdm import tqdm
import numpy as np
Expand Down Expand Up @@ -358,7 +365,7 @@ def _process_downloaded(
]
data_temp["n_configs"] = 0

(data_temp["total_charge"],) = self._calculate_total_charge(
data_temp["total_charge"] = self._calculate_total_charge(
data_temp[
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
]
Expand All @@ -377,105 +384,123 @@ def _process_downloaded(
name = key
index = self.molecule_names[name]

if final_conformer_only:
trajectory = [trajectory[-1]]
for state in trajectory:
add_record = True
properties, config = state
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),

# if set, let us see if the configuration has a force that exceeds the maximum
if max_force is not None:
force_magnitude = (
np.abs(
properties["properties"]["current gradient"]
+ properties["properties"][
"dispersion correction gradient"
]
)
* self.qm_parameters["dft_total_force"]["u_in"]
)
if np.any(force_magnitude > max_force):
add_record = False
if add_record:
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),
)
)

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
)
)
)

quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)
quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)

quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
)
)
)

# assign units
for datapoint in self.data:
Expand Down Expand Up @@ -564,6 +589,8 @@ def process(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
limit_atomic_species: Optional[list] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only=None,
n_threads=2,
) -> None:
"""
Expand All @@ -586,7 +613,11 @@ def process(
Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2"
limit_atomic_species: Optional[list] = None,
If set to a list of element symbols, records that contain any elements not in this list will be ignored.
n_threads, int, default=6
max_force: Optional[unit.Quantity], optional, default=None
    If set, any configurations with a force that exceeds this value will be excluded.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed.
n_threads, int, default=2
Number of concurrent threads for retrieving data from QCArchive
Examples
--------
Expand Down Expand Up @@ -664,6 +695,8 @@ def process(
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
atomic_numbers_to_limit=self.atomic_numbers_to_limit,
max_force=max_force,
final_conformer_only=final_conformer_only,
)

self._generate_hdf5()
46 changes: 43 additions & 3 deletions modelforge/curation/scripts/curate_PhAlkEthOH.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def PhAlkEthOH_openff_wrapper(
max_conformers_per_record=None,
total_conformers=None,
limit_atomic_species=None,
max_force=None,
final_conformer_only=False,
):
"""
This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file.
Expand Down Expand Up @@ -49,7 +51,10 @@ def PhAlkEthOH_openff_wrapper(
limit_atomic_species: list, optional, default=None
A list of atomic species to limit the dataset to. Any molecules that contain elements outside of this list
will be ignored. If not defined, no filtering by atomic species will be performed.
max_force: float, optional, default=None
The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored.
final_conformer_only: bool, optional, default=False
If True, only the final conformer for each molecule will be processed. If False, all conformers will be processed.
"""
from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration
Expand All @@ -67,13 +72,17 @@ def PhAlkEthOH_openff_wrapper(
total_conformers=total_conformers,
limit_atomic_species=limit_atomic_species,
n_threads=1,
max_force=max_force,
final_conformer_only=final_conformer_only,
)
print(f"Total records: {PhAlkEthOH_dataset.total_records}")
print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}")


def main():

from openff.units import unit

# define the location where to store and output the files
import os

Expand All @@ -83,9 +92,9 @@ def main():

# We'll want to provide some simple means of versioning
# if we make updates to either the underlying dataset, curation modules, or parameters given to the code
version = "0"
version = "1"
# version of the dataset to curate
version_select = f"v_{version}"
version_select = f"v_0"

# curate dataset with 1000 total conformers, max of 10 conformers per record
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000.hdf5"
Expand All @@ -99,6 +108,7 @@ def main():
max_records=1000,
total_conformers=1000,
max_conformers_per_record=10,
max_force=1.0 * unit.hartree / unit.bohr,
)

# curate the full dataset
Expand All @@ -110,6 +120,36 @@ def main():
local_cache_dir,
force_download=False,
version_select=version_select,
max_force=1.0 * unit.hartree / unit.bohr,
)

# curate dataset with 1000 total conformers, max of 10 conformers per record
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000_minimal.hdf5"

PhAlkEthOH_openff_wrapper(
hdf5_file_name,
output_file_dir,
local_cache_dir,
force_download=False,
version_select=version_select,
max_records=1000,
total_conformers=1000,
max_conformers_per_record=10,
max_force=1.0 * unit.hartree / unit.bohr,
final_conformer_only=True,
)

# curate the full dataset
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_minimal.hdf5"
print("total dataset")
PhAlkEthOH_openff_wrapper(
hdf5_file_name,
output_file_dir,
local_cache_dir,
force_download=False,
version_select=version_select,
max_force=1.0 * unit.hartree / unit.bohr,
final_conformer_only=True,
)


Expand Down
2 changes: 2 additions & 0 deletions modelforge/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,8 @@ def __init__(
Directory to store the files.
force_download : bool, optional
If set to True, the data will be downloaded even if it already exists. Default is False.
regenerate_cache : bool, optional
If set to True, the cache file will be regenerated even if it already exists. Default is False.
"""
self.url = url
self.gz_data_file = gz_data_file
Expand Down
Loading

0 comments on commit 5551bea

Please sign in to comment.