Skip to content

Commit

Permalink
Merge branch 'main' into dev-electrostatics
Browse files Browse the repository at this point in the history
  • Loading branch information
wiederm authored Sep 14, 2024
2 parents 6e70893 + 18c41d6 commit 590f86c
Show file tree
Hide file tree
Showing 41 changed files with 1,292 additions and 737 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -190,3 +190,7 @@ lightning_logs/
*.hdf5
*/tb_logs/*
.vscode/settings.json
logs/*
cache/*
*/logs/*
*/cache/*
5 changes: 5 additions & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
include CODE_OF_CONDUCT.md

include modelforge/dataset/yaml_files/*.yaml
include modelforge/curation/yaml_files/*.yaml
include modelforge/tests/data/potential_defaults/*.toml
include modelforge/tests/data/training_defaults/*.toml

global-exclude *.py[cod] __pycache__ *.so
27 changes: 27 additions & 0 deletions devtools/conda-envs/env.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
name: modelforge_env
channels:
- conda-forge
- pytorch
dependencies:
# Base depends
- python
- pip
- h5py
- tqdm
- toml
- qcportal>=0.50
- qcelemental
- pytorch>=2.1
- loguru
- lightning>=2.0.8
- tensorboard
- torchvision
- openff-units
- torchmetrics>=1.4
- pint=0.23
- rdkit
- retry
- sqlitedict
- pydantic>=2
- ray-all
- jax
1 change: 1 addition & 0 deletions devtools/conda-envs/test_env.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- pip
- h5py
- tqdm
- toml
- qcportal>=0.50
- qcelemental
- pytorch>=2.1
Expand Down
2 changes: 1 addition & 1 deletion docs/getting_started.rst
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ Here is an example of a training routine definition:
remove_self_energies = true # Whether to remove self-energies from the dataset
batch_size = 128 # Number of samples per batch
lr = 1e-3 # Learning rate for the optimizer
monitor = "val/per_molecule_energy/rmse" # Metric to monitor for early stopping and checkpointing
monitor_for_checkpoint = "val/per_molecule_energy/rmse" # Metric to monitor for checkpointing
[training.experiment_logger]
Expand Down
211 changes: 122 additions & 89 deletions modelforge/curation/phalkethoh_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,7 @@ def _calculate_total_charge(
rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms())

return (int(total_charge) * unit.elementary_charge,)
return int(total_charge) * unit.elementary_charge

def _process_downloaded(
self,
Expand All @@ -277,6 +277,8 @@ def _process_downloaded(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
atomic_numbers_to_limit: Optional[List[int]] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only: Optional[bool] = None,
):
"""
Processes a downloaded dataset: extracts relevant information.
Expand All @@ -295,6 +297,11 @@ def _process_downloaded(
If set, this will limit the total number of conformers to the specified number.
atomic_numbers_to_limit: Optional[List[int]], optional, default=None
If set, this will limit the dataset to only include molecules with atomic numbers in the list.
max_force: Optional[unit.Quantity], optional, default=None
If set, this will exclude any conformers with a force that exceeds this value.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed. This should be the final
energy minimized conformer.
"""
from tqdm import tqdm
import numpy as np
Expand Down Expand Up @@ -358,7 +365,7 @@ def _process_downloaded(
]
data_temp["n_configs"] = 0

(data_temp["total_charge"],) = self._calculate_total_charge(
data_temp["total_charge"] = self._calculate_total_charge(
data_temp[
"canonical_isomeric_explicit_hydrogen_mapped_smiles"
]
Expand All @@ -377,105 +384,123 @@ def _process_downloaded(
name = key
index = self.molecule_names[name]

if final_conformer_only:
trajectory = [trajectory[-1]]
for state in trajectory:
add_record = True
properties, config = state
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),

# if set, let us see if the configuration has a force that exceeds the maximum
if max_force is not None:
force_magnitude = (
np.abs(
properties["properties"]["current gradient"]
+ properties["properties"][
"dispersion correction gradient"
]
)
* self.qm_parameters["dft_total_force"]["u_in"]
)
if np.any(force_magnitude > max_force):
add_record = False
if add_record:
self.data[index]["n_configs"] += 1

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "geometry"
quantity_o = "geometry"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = config.reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
config.reshape(1, -1, 3),
)
)

# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
# note, we will use the convention of names being lowercase
# and spaces denoted by underscore
quantity = "current energy"
quantity_o = "dft_total_energy"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = properties["properties"][
quantity
]
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
properties["properties"][quantity],
)
)
)

quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)
quantity = "dispersion correction energy"
quantity_o = "dispersion_correction_energy"
# Note need to typecast here because of a bug in the
# qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
float(properties["properties"][quantity])
).reshape(1, 1)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
float(properties["properties"][quantity])
).reshape(1, 1),
),
)

quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "current gradient"
quantity_o = "dft_total_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
quantity = "dispersion correction gradient"
quantity_o = "dispersion_correction_gradient"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, -1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, -1, 3),
)
)
)

quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
quantity = "scf dipole"
quantity_o = "scf_dipole"
if quantity_o not in self.data[index].keys():
self.data[index][quantity_o] = np.array(
properties["properties"][quantity]
).reshape(1, 3)
else:
self.data[index][quantity_o] = np.vstack(
(
self.data[index][quantity_o],
np.array(
properties["properties"][quantity]
).reshape(1, 3),
)
)
)

# assign units
for datapoint in self.data:
Expand Down Expand Up @@ -564,6 +589,8 @@ def process(
max_conformers_per_record: Optional[int] = None,
total_conformers: Optional[int] = None,
limit_atomic_species: Optional[list] = None,
max_force: Optional[unit.Quantity] = None,
final_conformer_only=None,
n_threads=2,
) -> None:
"""
Expand All @@ -586,7 +613,11 @@ def process(
Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2"
limit_atomic_species: Optional[list] = None,
If set to a list of element symbols, records that contain any elements not in this list will be ignored.
n_threads, int, default=6
max_force: Optional[unit.Quantity], optional, default=None
If set, any configurations with a force that exceeds this value will be excluded.
final_conformer_only: Optional[bool], optional, default=None
If set to True, only the final conformer of each record will be processed.
n_threads, int, default=2
Number of concurrent threads for retrieving data from QCArchive
Examples
--------
Expand Down Expand Up @@ -664,6 +695,8 @@ def process(
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
atomic_numbers_to_limit=self.atomic_numbers_to_limit,
max_force=max_force,
final_conformer_only=final_conformer_only,
)

self._generate_hdf5()
Loading

0 comments on commit 590f86c

Please sign in to comment.