Skip to content

Commit

Permalink
Merge pull request #245 from chrisiacovella/phalkethoh_datasets
Browse files Browse the repository at this point in the history
Updated Phalkethoh datasets
  • Loading branch information
chrisiacovella authored Aug 30, 2024
2 parents 83bd334 + bf1d04c commit 5551bea
Show file tree
Hide file tree
Showing 5 changed files with 230 additions and 95 deletions.
211 changes: 122 additions & 89 deletions modelforge/curation/phalkethoh_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _calculate_total_charge(
rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms())

return (int(total_charge) * unit.elementary_charge,)
return int(total_charge) * unit.elementary_charge

def _process_downloaded(
self,
Expand All @@ -277,6 +277,8 @@ def _process_downloaded(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
atomic_numbers_to_limit: Optional[List[int]] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only: Optional[bool] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -295,6 +297,11 @@ def _process_downloaded(
If set, this will limit the total number of conformers to the specified number.
atomic_numbers_to_limit: Optional[List[int]], optional, default=None
If set, this will limit the dataset to only include molecules with atomic numbers in the list.
max_force: Optional[unit.Quantity], optional, default=None
    If set, this will exclude any conformers with a force that exceeds this value.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed. This should be the final
energy minimized conformer.
"""
from tqdm import tqdm
import numpy as np
Expand Down Expand Up @@ -358,7 +365,7 @@ def _process_downloaded(
]
data_temp["n_configs"] = 0

(data_temp["total_charge"],) = self._calculate_total_charge(
data_temp["total_charge"] = self._calculate_total_charge(
data_temp[
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
]
Expand All @@ -377,105 +384,123 @@ def _process_downloaded(
name = key
index = self.molecule_names[name]

if final_conformer_only:
trajectory = [trajectory[-1]]
for state in trajectory:
add_record = True
properties, config = state
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),

# if set, let us see if the configuration has a force that exceeds the maximum
if max_force is not None:
force_magnitude = (
np.abs(
properties["properties"]["current gradient"]
+ properties["properties"][
"dispersion correction gradient"
]
)
* self.qm_parameters["dft_total_force"]["u_in"]
)
if np.any(force_magnitude > max_force):
add_record = False
if add_record:
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),
)
)

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
)
)
)

quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)
quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)

quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
)
)
)

# assign units
for datapoint in self.data:
Expand Down Expand Up @@ -564,6 +589,8 @@ def process(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
limit_atomic_species: Optional[list] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only=None,
n_threads=2,
) -> None:
"""
Expand All @@ -586,7 +613,11 @@ def process(
Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2"
limit_atomic_species: Optional[list] = None,
If set to a list of element symbols, records that contain any elements not in this list will be ignored.
n_threads, int, default=6
max_force: Optional[unit.Quantity], optional, default=None
    If set, any configurations with a force that exceeds this value will be excluded.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed.
n_threads, int, default=2
Number of concurrent threads for retrieving data from QCArchive
Examples
--------
Expand Down Expand Up @@ -664,6 +695,8 @@ def process(
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
atomic_numbers_to_limit=self.atomic_numbers_to_limit,
max_force=max_force,
final_conformer_only=final_conformer_only,
)

self._generate_hdf5()
46 changes: 43 additions & 3 deletions modelforge/curation/scripts/curate_PhAlkEthOH.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ def PhAlkEthOH_openff_wrapper(
max_conformers_per_record=None,
total_conformers=None,
limit_atomic_species=None,
max_force=None,
final_conformer_only=False,
):
"""
This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file.
Expand Down Expand Up @@ -49,7 +51,10 @@ def PhAlkEthOH_openff_wrapper(
limit_atomic_species: list, optional, default=None
A list of atomic species to limit the dataset to. Any molecules that contain elements outside of this list
will be ignored. If not defined, no filtering by atomic species will be performed.
max_force: float, optional, default=None
The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored.
final_conformer_only: bool, optional, default=False
If True, only the final conformer for each molecule will be processed. If False, all conformers will be processed.
"""
from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration
Expand All @@ -67,13 +72,17 @@ def PhAlkEthOH_openff_wrapper(
total_conformers=total_conformers,
limit_atomic_species=limit_atomic_species,
n_threads=1,
max_force=max_force,
final_conformer_only=final_conformer_only,
)
print(f"Total records: {PhAlkEthOH_dataset.total_records}")
print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}")


def main():

from openff.units import unit

# define the location where to store and output the files
import os

Expand All @@ -83,9 +92,9 @@ def main():

# We'll want to provide some simple means of versioning
# if we make updates to either the underlying dataset, curation modules, or parameters given to the code
version = "0"
version = "1"
# version of the dataset to curate
version_select = f"v_{version}"
version_select = f"v_0"

# curate dataset with 1000 total conformers, max of 10 conformers per record
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000.hdf5"
Expand All @@ -99,6 +108,7 @@ def main():
max_records=1000,
total_conformers=1000,
max_conformers_per_record=10,
max_force=1.0 * unit.hartree / unit.bohr,
)

# curate the full dataset
Expand All @@ -110,6 +120,36 @@ def main():
local_cache_dir,
force_download=False,
version_select=version_select,
max_force=1.0 * unit.hartree / unit.bohr,
)

# curate dataset with 1000 total conformers, max of 10 conformers per record
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000_minimal.hdf5"

PhAlkEthOH_openff_wrapper(
hdf5_file_name,
output_file_dir,
local_cache_dir,
force_download=False,
version_select=version_select,
max_records=1000,
total_conformers=1000,
max_conformers_per_record=10,
max_force=1.0 * unit.hartree / unit.bohr,
final_conformer_only=True,
)

# curate the full dataset
hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_minimal.hdf5"
print("total dataset")
PhAlkEthOH_openff_wrapper(
hdf5_file_name,
output_file_dir,
local_cache_dir,
force_download=False,
version_select=version_select,
max_force=1.0 * unit.hartree / unit.bohr,
final_conformer_only=True,
)


Expand Down
2 changes: 2 additions & 0 deletions modelforge/dataset/dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -488,6 +488,8 @@ def __init__(
Directory to store the files.
force_download : bool, optional
If set to True, the data will be downloaded even if it already exists. Default is False.
regenerate_cache : bool, optional
If set to True, the cache file will be regenerated even if it already exists. Default is False.
"""
self.url = url
self.gz_data_file = gz_data_file
Expand Down
Loading

0 comments on commit 5551bea

Please sign in to comment.