From 1357d28afe8c45bf48bb55cd5b822182c1f7ddbe Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 19 Aug 2024 14:20:38 +0200 Subject: [PATCH 01/66] bugfix for physnet interaction --- modelforge/potential/physnet.py | 165 ++++++++++++++++---------------- 1 file changed, 83 insertions(+), 82 deletions(-) diff --git a/modelforge/potential/physnet.py b/modelforge/potential/physnet.py index 1bbe69b2..206c5df0 100644 --- a/modelforge/potential/physnet.py +++ b/modelforge/potential/physnet.py @@ -3,32 +3,40 @@ """ from dataclasses import dataclass, field -from typing import Dict, Optional, Union, List, Dict, Type +from typing import Dict, List, Optional, Type, Union import torch from loguru import logger as log from openff.units import unit from torch import nn -from .models import PairListOutputs, NNPInput, BaseNetwork, CoreNetwork -from modelforge.potential.utils import NeuralNetworkData +from modelforge.potential.utils import NeuralNetworkData, shared_config_prior +from modelforge.utils.io import import_ +from modelforge.utils.units import _convert_str_to_unit + +from .models import BaseNetwork, CoreNetwork, NNPInput, PairListOutputs +from .utils import Dense @dataclass class PhysNetNeuralNetworkData(NeuralNetworkData): """ - A dataclass to structure the inputs for PhysNet-based neural network potentials, - facilitating the efficient and structured representation of atomic systems for - energy computation and property prediction within the PhysNet framework. + A dataclass to structure the inputs for PhysNet-based neural network + potentials, facilitating the efficient and structured representation of + atomic systems for energy computation and property prediction within the + PhysNet framework. Attributes ---------- atomic_embedding : torch.Tensor - A 2D tensor containing embeddings or features for each atom, derived from atomic numbers or other properties. - Shape: [num_atoms, embedding_dim]. + A 2D tensor containing embeddings or features for each atom, derived + from atomic numbers or other properties. Shape: [num_atoms, + embedding_dim]. f_ij : Optional[torch.Tensor] - A tensor representing the radial basis function (RBF) expansion applied to distances between atom pairs, - capturing the local chemical environment. Will be added after initialization. Shape: [num_pairs, num_rbf]. + A tensor representing the radial basis function (RBF) expansion applied + to distances between atom pairs, capturing the local chemical + environment. Will be added after initialization. Shape: [num_pairs, + num_rbf]. """ atomic_embedding: Optional[torch.Tensor] = field(default=None) @@ -64,9 +72,10 @@ def __init__( self.cutoff_module = CosineAttenuationFunction(maximum_interaction_radius) # Initialize radial symmetry function module - from .utils import PhysNetRadialBasisFunction from modelforge.potential.utils import FeaturizeInput + from .utils import PhysNetRadialBasisFunction + self.featurize_input = FeaturizeInput(featurization_config) self.radial_symmetry_function_module = PhysNetRadialBasisFunction( @@ -117,13 +126,15 @@ def forward(self, x: torch.Tensor, activation_fn: bool = False) -> torch.Tensor: """ Apply gating to the input tensor. - Parameters: - ----------- + Parameters + ---------- x : torch.Tensor The input tensor to gate. + activation_fn : bool, optional + Whether to apply an activation function, by default False. - Returns: - -------- + Returns + ------- torch.Tensor The gated input tensor. 
""" @@ -131,21 +142,23 @@ def forward(self, x: torch.Tensor, activation_fn: bool = False) -> torch.Tensor: return gating_signal * x -from .utils import DenseWithCustomDist - - class PhysNetResidual(nn.Module): """ - Implements a preactivation residual block as described in Equation 4 of the PhysNet paper. + Implements a preactivation residual block as described in Equation 4 of the + PhysNet paper. - The block refines atomic feature vectors by adding a residual component computed through two linear transformations and a non-linear activation function (Softplus). This setup enhances gradient flow and supports effective deep network training by employing a preactivation scheme. + The block refines atomic feature vectors by adding a residual component + computed through two linear transformations and a non-linear activation + function (Softplus). This setup enhances gradient flow and supports + effective deep network training by employing a preactivation scheme. Parameters ---------- input_dim : int Dimensionality of the input feature vector. output_dim : int - Dimensionality of the output feature vector, which typically matches the input dimension. + Dimensionality of the output feature vector, which typically matches the + input dimension. activation_function : Type[torch.nn.Module] The activation function to be used in the residual block. """ @@ -158,27 +171,28 @@ def __init__( ): super().__init__() # Initialize dense layers and residual connection - self.dense = DenseWithCustomDist( - input_dim, output_dim, activation_function=activation_function + + self.dense = nn.Sequential( + activation_function, + Dense(input_dim, output_dim, activation_function), + Dense(output_dim, output_dim), ) - self.residual = DenseWithCustomDist(output_dim, output_dim) def forward(self, x: torch.Tensor) -> torch.Tensor: """ Forward pass of the ResidualBlock. - Parameters: - ----------- - x: torch.Tensor + Parameters + ---------- + x : torch.Tensor Input tensor containing feature vectors of atoms. - Returns: - -------- + Returns + ------- torch.Tensor Output tensor after applying the residual block operations. """ - # update x with residual - return x + self.residual(self.dense(x)) + return x + self.dense(x) class PhysNetInteractionModule(nn.Module): @@ -219,21 +233,19 @@ def __init__( ) # Initialize networks for processing atomic embeddings of i and j atoms - self.interaction_i = DenseWithCustomDist( + self.interaction_i = Dense( number_of_per_atom_features, number_of_per_atom_features, activation_function=activation_function, ) - self.interaction_j = DenseWithCustomDist( + self.interaction_j = Dense( number_of_per_atom_features, number_of_per_atom_features, activation_function=activation_function, ) # Initialize processing network - self.process_v = DenseWithCustomDist( - number_of_per_atom_features, number_of_per_atom_features - ) + self.process_v = Dense(number_of_per_atom_features, number_of_per_atom_features) # Initialize residual blocks self.residuals = nn.ModuleList( @@ -253,68 +265,65 @@ def __init__( def forward(self, data: PhysNetNeuralNetworkData) -> torch.Tensor: """ - Processes input tensors through the interaction module, applying Gaussian Logarithm Attention to modulate - the influence of pairwise distances on the interaction features, followed by aggregation to update atomic embeddings. 
+ Processes input tensors through the interaction module, applying + Gaussian Logarithm Attention to modulate the influence of pairwise + distances on the interaction features, followed by aggregation to update + atomic embeddings. Parameters ---------- data : PhysNetNeuralNetworkData - Input data containing pair indices, distances, and atomic embeddings. + Input data containing pair indices, distances, and atomic + embeddings. Returns ------- torch.Tensor - Updated atomic feature representations incorporating interaction information. + Updated atomic feature representations incorporating interaction + information. """ - # Equation 6: Formation of the Proto-Message ṽ_i for an Atom i ṽ_i = - # σ(Wl_I * x_i^l + bl_I) + Σ_j (G_g * Wl * (σ(σl_J * x_j^l + bl_J)) * - # g(r_ij)) - # - # Equation 6 implementation overview: ṽ_i = x_i_prime + - # sum_over_j(x_j_prime * f_ij_prime) where: - # - x_i_prime and x_j_prime are the features of atoms i and j, - # respectively, processed through separate networks. - # - f_ij_prime represents the modulated radial basis functions (f_ij) by - # the Gaussian Logarithm Attention weights. # extract relevant variables - idx_i, idx_j = data.pair_indices - f_ij = data.f_ij - x = data.atomic_embedding + idx_i, idx_j = data.pair_indices # (nr_of_pairs, 2) + f_ij = data.f_ij # (nr_of_pairs, number_of_radial_basis_functions) # # Apply activation to atomic embeddings - xa = self.dropout(self.activation_function(x)) + per_atom_embedding = self.activation_function( + data.atomic_embedding + ) # (nr_of_atoms_in_batch, number_of_per_atom_features) # calculate attention weights and transform to # input shape: (number_of_pairs, number_of_radial_basis_functions) # output shape: (number_of_pairs, number_of_per_atom_features) g = self.attention_mask(f_ij) - # Calculate contribution of central atom - x_i = self.interaction_i(xa) + # Calculate contribution of central atom i + per_atom_updated_embedding = self.interaction_i(per_atom_embedding) + # Calculate contribution of neighbor atom - x_j = self.interaction_j(xa) - # Gather the results according to idx_j - x_j = x_j[idx_j] - # Multiply the gathered features by g - x_j_modulated = x_j * g - # Aggregate modulated contributions for each atom i - x_j_prime = torch.zeros_like(x_i) - x_j_prime.scatter_add_( - 0, idx_i.unsqueeze(-1).expand(-1, x_j_modulated.size(-1)), x_j_modulated + per_interaction_embededding_for_atom_j = ( + self.interaction_j(per_atom_embedding[idx_j]) * g + ) + + per_atom_updated_embedding.scatter_add_( + 0, + idx_i.unsqueeze(-1).expand( + -1, per_interaction_embededding_for_atom_j.shape[-1] + ), + per_interaction_embededding_for_atom_j, ) - # Draft proto message v_tilde - m = x_i + x_j_prime - # shape of m (nr_of_atoms_in_batch, 1) - # Equation 4: Preactivation Residual Block Implementation - # xl+2_i = xl_i + Wl+1 * sigma(Wl * xl_i + bl) + bl+1 + # apply residual blocks for residual in self.residuals: - m = residual( - m + per_atom_updated_embedding = residual( + per_atom_updated_embedding ) # shape (nr_of_atoms_in_batch, number_of_radial_basis_functions) - m = self.activation_function(m) - x = self.gate * x + self.process_v(m) + + per_atom_updated_embedding = self.activation_function( + per_atom_updated_embedding + ) + + x = self.gate * per_atom_embedding + self.process_v(per_atom_updated_embedding) return x @@ -359,7 +368,6 @@ def __init__( number_of_per_atom_features, number_of_atomic_properties, weight_init=torch.nn.init.zeros_, - bias=False, ) def forward(self, x: torch.Tensor) -> torch.Tensor: @@ -645,13 
+653,6 @@ def compute_properties( } -from .models import NNPInput, BaseNetwork -from typing import List -from modelforge.utils.units import _convert_str_to_unit -from modelforge.utils.io import import_ -from modelforge.potential.utils import shared_config_prior - - class PhysNet(BaseNetwork): """ Implementation of the PhysNet neural network potential. From 21d469e91bb83505d8c176e97dc56a982729d633 Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 19 Aug 2024 14:42:54 +0200 Subject: [PATCH 02/66] minor changes --- modelforge/potential/physnet.py | 39 ++++----------------------------- 1 file changed, 4 insertions(+), 35 deletions(-) diff --git a/modelforge/potential/physnet.py b/modelforge/potential/physnet.py index 206c5df0..7238eaaa 100644 --- a/modelforge/potential/physnet.py +++ b/modelforge/potential/physnet.py @@ -109,39 +109,6 @@ def forward(self, data: Type[PhysNetNeuralNetworkData]) -> Dict[str, torch.Tenso } -class GatingModule(nn.Module): - def __init__(self, number_of_atom_basis: int): - """ - Initializes a gating module that optionally applies a sigmoid gating mechanism to input features. - - Parameters - ---------- - number_of_atom_basis : int - The dimensionality of the input (and output) features. - """ - super().__init__() - self.gate = nn.Parameter(torch.ones(number_of_atom_basis)) - - def forward(self, x: torch.Tensor, activation_fn: bool = False) -> torch.Tensor: - """ - Apply gating to the input tensor. - - Parameters - ---------- - x : torch.Tensor - The input tensor to gate. - activation_fn : bool, optional - Whether to apply an activation function, by default False. - - Returns - ------- - torch.Tensor - The gated input tensor. - """ - gating_signal = torch.sigmoid(self.gate) - return gating_signal * x - - class PhysNetResidual(nn.Module): """ Implements a preactivation residual block as described in Equation 4 of the @@ -323,8 +290,10 @@ def forward(self, data: PhysNetNeuralNetworkData) -> torch.Tensor: per_atom_updated_embedding ) - x = self.gate * per_atom_embedding + self.process_v(per_atom_updated_embedding) - return x + per_atom_embedding = self.gate * per_atom_embedding + self.process_v( + per_atom_updated_embedding + ) + return per_atom_embedding class PhysNetOutput(nn.Module): From 18b42cd5c370c7826d41d999d29f360321f6f44f Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 19 Aug 2024 14:45:59 +0200 Subject: [PATCH 03/66] update docstrings --- modelforge/potential/physnet.py | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/modelforge/potential/physnet.py b/modelforge/potential/physnet.py index 7238eaaa..20bc9ac4 100644 --- a/modelforge/potential/physnet.py +++ b/modelforge/potential/physnet.py @@ -410,25 +410,6 @@ def forward(self, data: PhysNetNeuralNetworkData) -> Dict[str, torch.Tensor]: Dict[str, torch.Tensor] Dictionary containing predictions and updated embeddings. """ - # The PhysNet module is a sequence of interaction modules and residual modules. 
- # x_1, ..., x_N - # | - # v - # ┌─────────────┐ - # │ interaction │ <-- g(d_ij) - # └─────────────┘ - # │ - # v - # ┌───────────┐ - # │ residual │ - # └───────────┘ - # ┌───────────┐ - # │ residual │ - # └───────────┘ - # ┌───────────┐ │ - # │ output │<-----│ - # └───────────┘ │ - # v # calculate the interaction v = self.interaction(data) From b5f9c85f8d483775ba869fa46874c2d1f4bb5f83 Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 19 Aug 2024 15:03:22 +0200 Subject: [PATCH 04/66] fix --- modelforge/potential/painn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/potential/painn.py b/modelforge/potential/painn.py index 0d1925fc..d70586f2 100644 --- a/modelforge/potential/painn.py +++ b/modelforge/potential/painn.py @@ -221,7 +221,7 @@ def compute_properties( ) results = { - "per_atom_scalar_representation": per_atom_scalar_feature, + "per_atom_scalar_representation": per_atom_scalar_feature.squeeze(1), "per_atom_vector_representation": per_atom_vector_feature, "atomic_subsystem_indices": data.atomic_subsystem_indices, } From 861aadf7cbf68b8bb7dee8c0bd10ac4dac45f14d Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 19 Aug 2024 20:18:57 +0200 Subject: [PATCH 05/66] save_dir was defined in runtime and training.toml --- modelforge/tests/data/runtime_defaults/runtime.toml | 1 - modelforge/train/parameters.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/modelforge/tests/data/runtime_defaults/runtime.toml b/modelforge/tests/data/runtime_defaults/runtime.toml index 238ef10c..0dd5c758 100644 --- a/modelforge/tests/data/runtime_defaults/runtime.toml +++ b/modelforge/tests/data/runtime_defaults/runtime.toml @@ -1,5 +1,4 @@ [runtime] -save_dir = "lightning_logs" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" accelerator = "cpu" diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 95bf3569..8fe6c2c9 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -309,7 +309,6 @@ class RuntimeParameters(ParametersBase): A class to hold the runtime parameters that inherits from the pydantic BaseModel args: - save_dir (str): The save directory experiment_name (str): The experiment name accelerator (Accelerator): The accelerator, options are: "cpu", "gpu", "tpu" number_of_nodes (int): The number of nodes @@ -322,7 +321,6 @@ class RuntimeParameters(ParametersBase): """ - save_dir: str experiment_name: str accelerator: Accelerator number_of_nodes: int From 90668ed359ffe0f1295024e8dd49c1ebfa8b384f Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Mon, 19 Aug 2024 13:42:13 -0700 Subject: [PATCH 06/66] enable multiple cutoffs. neighborlist accepts a dict that can contain multiple cutoffs and returns dict with the same keys. --- modelforge/potential/ani.py | 21 +++++--- modelforge/potential/models.py | 85 +++++++++++++++++++++++-------- modelforge/potential/painn.py | 11 ++-- modelforge/potential/physnet.py | 9 +++- modelforge/potential/sake.py | 11 ++-- modelforge/potential/schnet.py | 13 +++-- modelforge/potential/tensornet.py | 11 ++-- modelforge/tests/test_models.py | 75 ++++++++++++++++++++++----- 8 files changed, 180 insertions(+), 56 deletions(-) diff --git a/modelforge/potential/ani.py b/modelforge/potential/ani.py index 2e84ff13..32da49ae 100644 --- a/modelforge/potential/ani.py +++ b/modelforge/potential/ani.py @@ -2,6 +2,9 @@ This module contains the classes for the ANI2x neural network potential. 
""" +from __future__ import annotations + + from dataclasses import dataclass from typing import TYPE_CHECKING, Dict, Tuple, Type from .models import BaseNetwork, CoreNetwork @@ -12,9 +15,10 @@ from modelforge.utils.prop import SpeciesAEV -if TYPE_CHECKING: - from modelforge.dataset.dataset import NNPInput - from .models import PairListOutputs + +# if TYPE_CHECKING: +# from modelforge.dataset.dataset import NNPInput +# from .models import PairListOutputs def triu_index(num_species: int) -> torch.Tensor: @@ -561,7 +565,7 @@ def __init__( self.register_buffer("lookup_tensor", lookup_tensor) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> AniNeuralNetworkData: """ Prepare the model-specific input data for the ANI2x model. @@ -570,8 +574,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : Dict[str,PairListOutputs] + The output from the pairlist. Returns ------- @@ -580,6 +584,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_data = AniNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/models.py b/modelforge/potential/models.py index 5285fa4c..27dcd4f2 100644 --- a/modelforge/potential/models.py +++ b/modelforge/potential/models.py @@ -311,24 +311,31 @@ def forward( class Neighborlist(Pairlist): """ - Manage neighbor list calculations with a specified cutoff distance. + Manage neighbor list calculations with a specified cutoff distance(s). This class extends Pairlist to consider a cutoff distance for neighbor calculations. """ - def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = False): + def __init__( + self, cutoffs: Dict[str, unit.Quantity], only_unique_pairs: bool = False + ): """ Initialize the Neighborlist with a specific cutoff distance. Parameters ---------- - cutoff : unit.Quantity - Cutoff distance for neighbor calculations. + cutoffs : Dict[str, unit.Quantity] + Cutoff distances for neighbor calculations. only_unique_pairs : bool, optional If True, only unique pairs are returned (default is False). 
""" super().__init__(only_unique_pairs=only_unique_pairs) - self.register_buffer("cutoff", torch.tensor(cutoff.to(unit.nanometer).m)) + + # self.register_buffer("cutoff", torch.tensor(cutoff.to(unit.nanometer).m)) + self.register_buffer( + "cutoffs", torch.tensor([c.to(unit.nanometer).m for c in cutoffs.values()]) + ) + self.labels = list(cutoffs.keys()) def forward( self, @@ -364,16 +371,26 @@ def forward( r_ij = self.calculate_r_ij(pair_indices, positions) d_ij = self.calculate_d_ij(r_ij) - # Find pairs within the cutoff - in_cutoff = (d_ij <= self.cutoff).squeeze() - # Get the atom indices within the cutoff - pair_indices_within_cutoff = pair_indices[:, in_cutoff] + interacting_outputs = {} + for cutoff, label in zip(self.cutoffs, self.labels): + # Find pairs within the cutoff + in_cutoff = (d_ij <= cutoff).squeeze() + # Get the atom indices within the cutoff + pair_indices_within_cutoff = pair_indices[:, in_cutoff] + + interacting_outputs[label] = PairListOutputs( + pair_indices=pair_indices_within_cutoff, + d_ij=d_ij[in_cutoff], + r_ij=r_ij[in_cutoff], + ) - return PairListOutputs( - pair_indices=pair_indices_within_cutoff, - d_ij=d_ij[in_cutoff], - r_ij=r_ij[in_cutoff], - ) + return interacting_outputs + # + # return PairListOutputs( + # pair_indices=pair_indices_within_cutoff, + # d_ij=d_ij[in_cutoff], + # r_ij=r_ij[in_cutoff], + # ) from typing import Callable, Literal, Optional, Union @@ -666,14 +683,16 @@ class ComputeInteractingAtomPairs(torch.nn.Module): distances (d_ij), and displacement vectors (r_ij) for molecular simulations. """ - def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = True): + def __init__( + self, cutoffs: Dict[str, unit.Quantity], only_unique_pairs: bool = True + ): """ Initialize the ComputeInteractingAtomPairs module. Parameters ---------- - cutoff : unit.Quantity - The cutoff distance for neighbor list calculations. + cutoffs : Dict[str, unit.Quantity] + The cutoff distance(s) for neighbor list calculations. only_unique_pairs : bool, optional Whether to only use unique pairs in the pair list calculation, by default True. This should be set to True for all message passing @@ -684,7 +703,7 @@ def __init__(self, cutoff: unit.Quantity, only_unique_pairs: bool = True): from .models import Neighborlist self.only_unique_pairs = only_unique_pairs - self.calculate_distances_and_pairlist = Neighborlist(cutoff, only_unique_pairs) + self.calculate_distances_and_pairlist = Neighborlist(cutoffs, only_unique_pairs) def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): """ @@ -703,7 +722,7 @@ def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): Returns ------- PairListOutputs - A namedtuple containing the pair indices, Euclidean distances + A Dict for each cutoff type, where each entry is a namedtuple containing the pair indices, Euclidean distances (d_ij), and displacement vectors (r_ij). 
""" # --------------------------- @@ -736,6 +755,7 @@ def prepare_inputs(self, data: Union[NNPInput, NamedTuple]): pair_indices=pair_list.to(torch.int64), ) + # this will return a Dict of the PairListOutputs for each cutoff we specify return pairlist_output def _input_checks(self, data: Union[NNPInput, NamedTuple]): @@ -994,6 +1014,8 @@ def __init__( postprocessing_parameter: Dict[str, Dict[str, bool]], dataset_statistic: Optional[Dict[str, float]], maximum_interaction_radius: unit.Quantity, + maximum_dispersion_interaction_radius: Optional[unit.Quantity] = None, + maximum_coulomb_interaction_radius: Optional[unit.Quantity] = None, potential_seed: Optional[int] = None, ): """ @@ -1006,7 +1028,11 @@ def __init__( dataset_statistic : Optional[Dict[str, float]] Dataset statistics for normalization. maximum_interaction_radius : unit.Quantity - cutoff radius. + cutoff radius for local interactions + maximum_dispersion_interaction_radius : unit.Quantity, optional + cutoff radius for dispersion interactions. + maximum_coulomb_interaction_radius : unit.Quantity, optional + cutoff radius for Coulomb interactions. potential_seed : Optional[int], optional Value used for torch.manual_seed, by default None. """ @@ -1040,8 +1066,25 @@ def __init__( raise RuntimeError( "The only_unique_pairs attribute is not set in the child class. Please set it to True or False before calling super().__init__." ) + + # to handle multiple cutoffs, we will create a dictionary with the cutoffs + # the dictionary will make it more transparent which PairListOutputs belong to which cutoff + + cutoffs = {} + cutoffs["maximum_interaction_radius"] = _convert_str_to_unit( + maximum_interaction_radius + ) + if maximum_dispersion_interaction_radius is not None: + cutoffs["maximum_dispersion_interaction_radius"] = _convert_str_to_unit( + maximum_dispersion_interaction_radius + ) + if maximum_coulomb_interaction_radius is not None: + cutoffs["maximum_coulomb_interaction_radius"] = _convert_str_to_unit( + maximum_coulomb_interaction_radius + ) + self.compute_interacting_pairs = ComputeInteractingAtomPairs( - cutoff=_convert_str_to_unit(maximum_interaction_radius), + cutoffs=cutoffs, only_unique_pairs=self.only_unique_pairs, ) diff --git a/modelforge/potential/painn.py b/modelforge/potential/painn.py index c90a5f1a..6d297530 100644 --- a/modelforge/potential/painn.py +++ b/modelforge/potential/painn.py @@ -138,7 +138,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: NNPInput, pairlist_output: PairListOutputs + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> PaiNNNeuralNetworkData: """ Prepare the model-specific input for the PaiNN network. @@ -147,8 +147,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : dict[str, PairListOutputs] + The output from the pairlist. Returns ------- @@ -159,6 +159,11 @@ def _model_specific_input_preparation( number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. 
"maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = PaiNNNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/physnet.py b/modelforge/potential/physnet.py index 20bc9ac4..ee20721d 100644 --- a/modelforge/potential/physnet.py +++ b/modelforge/potential/physnet.py @@ -486,7 +486,7 @@ def __init__( self.atomic_shift = nn.Parameter(torch.zeros(maximum_atomic_number, 2)) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> PhysNetNeuralNetworkData: """ Prepare model-specific input data. @@ -495,7 +495,7 @@ def _model_specific_input_preparation( ---------- data : NNPInput Input data containing atomic information. - pairlist_output : PairListOutputs + pairlist_output : Dict[str, PairListOutputs] Output from the pairlist calculation. Returns @@ -505,6 +505,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = PhysNetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/sake.py b/modelforge/potential/sake.py index bed45f2e..49f5a48e 100644 --- a/modelforge/potential/sake.py +++ b/modelforge/potential/sake.py @@ -138,7 +138,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> SAKENeuralNetworkInput: """ Prepare the model-specific input. @@ -147,8 +147,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput Input data. - pairlist_output : PairListOutputs - Pairlist output. + pairlist_output : Dict[str,PairListOutputs] + Pairlist output(s) Returns ------- @@ -159,6 +159,11 @@ def _model_specific_input_preparation( number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = SAKENeuralNetworkInput( pair_indices=pairlist_output.pair_indices, number_of_atoms=number_of_atoms, diff --git a/modelforge/potential/schnet.py b/modelforge/potential/schnet.py index 2640b456..9155832f 100644 --- a/modelforge/potential/schnet.py +++ b/modelforge/potential/schnet.py @@ -93,7 +93,7 @@ def __init__( number_of_radial_basis_functions, featurization_config=featurization_config, ) - # Intialize interaction blocks + # Initialize interaction blocks if shared_interactions: self.interaction_modules = nn.ModuleList( [ @@ -134,7 +134,7 @@ def __init__( ) def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: PairListOutputs + self, data: "NNPInput", pairlist_output: Dict[str, PairListOutputs] ) -> SchnetNeuralNetworkData: """ Prepare the input data for the SchNet model. @@ -143,8 +143,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : Dict[str, PairListOutputs] + The pairlist output(s). 
Returns ------- @@ -153,6 +153,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnp_input = SchnetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/potential/tensornet.py b/modelforge/potential/tensornet.py index fb03985d..3e9079ad 100644 --- a/modelforge/potential/tensornet.py +++ b/modelforge/potential/tensornet.py @@ -367,7 +367,7 @@ def compute_properties( } def _model_specific_input_preparation( - self, data: "NNPInput", pairlist_output: "PairListOutputs" + self, data: NNPInput, pairlist_output: Dict[str, PairListOutputs] ) -> TensorNetNeuralNetworkData: """ Prepare the input data for the TensorNet model. @@ -376,8 +376,8 @@ def _model_specific_input_preparation( ---------- data : NNPInput The input data for the model. - pairlist_output : PairListOutputs - The pairlist output. + pairlist_output : Dict[str, PairListOutputs] + The pairlist output(s) Returns ------- @@ -386,6 +386,11 @@ def _model_specific_input_preparation( """ number_of_atoms = data.atomic_numbers.shape[0] + # Note, pairlist_output is a Dict where the key corresponds to the name of the cutoff parameter + # e.g. "maximum_interaction_radius" + + pairlist_output = pairlist_output["maximum_interaction_radius"] + nnpdata = TensorNetNeuralNetworkData( pair_indices=pairlist_output.pair_indices, d_ij=pairlist_output.d_ij, diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 9709c6b0..1faecbf6 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -660,8 +660,8 @@ def test_pairlist(): from openff.units import unit cutoff = 5.0 * unit.nanometer # no relevant cutoff - pairlist = Neighborlist(cutoff, only_unique_pairs=True) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=True) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices # pairlist describes the pairs of interacting atoms within a batch @@ -690,8 +690,8 @@ def test_pairlist(): # test with cutoff cutoff = 2.0 * unit.nanometer - pairlist = Neighborlist(cutoff, only_unique_pairs=True) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=True) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices assert torch.equal(pair_indices, torch.tensor([[0, 1, 3, 4], [1, 2, 4, 5]])) @@ -714,8 +714,8 @@ def test_pairlist(): # test with complete pairlist cutoff = 2.0 * unit.nanometer - pairlist = Neighborlist(cutoff, only_unique_pairs=False) - r = pairlist(positions, atomic_subsystem_indices) + pairlist = Neighborlist({"cutoff1": cutoff}, only_unique_pairs=False) + r = pairlist(positions, atomic_subsystem_indices)["cutoff1"] pair_indices = r.pair_indices print(pair_indices, flush=True) @@ -726,11 +726,13 @@ def test_pairlist(): # make sure that Pairlist and Neighborlist behave the same for large cutoffs cutoff = 10.0 * unit.nanometer only_unique_pairs = False - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, 
atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert torch.equal(pair_indices, neighbor_indices) @@ -738,11 +740,13 @@ def test_pairlist(): # make sure that they are the same also for non-redundant pairs cutoff = 10.0 * unit.nanometer only_unique_pairs = True - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert torch.equal(pair_indices, neighbor_indices) @@ -750,16 +754,59 @@ def test_pairlist(): # this should fail cutoff = 2.0 * unit.nanometer only_unique_pairs = True - neighborlist = Neighborlist(cutoff, only_unique_pairs=only_unique_pairs) + neighborlist = Neighborlist( + {"cutoff1": cutoff}, only_unique_pairs=only_unique_pairs + ) pairlist = Pairlist(only_unique_pairs=only_unique_pairs) r = pairlist(positions, atomic_subsystem_indices) pair_indices = r.pair_indices - r = neighborlist(positions, atomic_subsystem_indices) + r = neighborlist(positions, atomic_subsystem_indices)["cutoff1"] neighbor_indices = r.pair_indices assert not pair_indices.shape == neighbor_indices.shape +def test_multiple_neighborlists(): + from modelforge.potential.models import Pairlist, Neighborlist + import torch + from openff.units import unit + + atomic_subsystem_indices = torch.tensor([0, 0, 0, 0, 0]) + + positions = torch.tensor( + [ + [0.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + [2.0, 0.0, 0.0], + [3.0, 0.0, 0.0], + [4.0, 0.0, 0.0], + ] + ) + + cutoff_short = 1.5 * unit.nanometer + cutoff_medium = 2.5 * unit.nanometer + cutoff_long = 3.5 * unit.nanometer + pairlist = Neighborlist( + {"short": cutoff_short, "medium": cutoff_medium, "long": cutoff_long}, + only_unique_pairs=True, + ) + r = pairlist(positions, atomic_subsystem_indices) + + assert torch.equal( + r["short"].pair_indices, torch.tensor([[0, 1, 2, 3], [1, 2, 3, 4]]) + ) + + assert torch.equal( + r["medium"].pair_indices, + torch.tensor([[0, 0, 1, 1, 2, 2, 3], [1, 2, 2, 3, 3, 4, 4]]), + ) + + assert torch.equal( + r["long"].pair_indices, + torch.tensor([[0, 0, 0, 1, 1, 1, 2, 2, 3], [1, 2, 3, 2, 3, 4, 3, 4, 4]]), + ) + + def test_pairlist_precomputation(): from modelforge.potential.models import Pairlist import torch @@ -1113,7 +1160,7 @@ def test_pairlist_calculate_r_ij_and_d_ij(): # Create Pairlist instance # --------------------------- # # Only unique pairs - pairlist = Neighborlist(cutoff, only_unique_pairs=True) + pairlist = Neighborlist({"cutoff_1": cutoff}, only_unique_pairs=True) pair_indices = pairlist.enumerate_all_pairs(atomic_subsystem_indices) # Calculate r_ij and d_ij @@ -1135,7 +1182,7 @@ def test_pairlist_calculate_r_ij_and_d_ij(): # --------------------------- # # ALL pairs - pairlist = Neighborlist(cutoff, only_unique_pairs=False) + pairlist = Neighborlist({"cutoff_1": cutoff}, only_unique_pairs=False) pair_indices = pairlist.enumerate_all_pairs(atomic_subsystem_indices) # Calculate r_ij and d_ij From f2d642b6281e4d971fd07746970f2c0a9d69ca88 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Mon, 19 Aug 2024 14:15:28 -0700 Subject: [PATCH 07/66] updated tensor net 
test --- modelforge/tests/test_tensornet.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/modelforge/tests/test_tensornet.py b/modelforge/tests/test_tensornet.py index c7a9fd4e..92ed271c 100644 --- a/modelforge/tests/test_tensornet.py +++ b/modelforge/tests/test_tensornet.py @@ -106,7 +106,9 @@ def test_input(): ], ) tensornet.compute_interacting_pairs._input_checks(mf_input) - pairlist_output = tensornet.compute_interacting_pairs.prepare_inputs(mf_input) + pairlist_output = tensornet.compute_interacting_pairs.prepare_inputs(mf_input)[ + "maximum_interaction_radius" + ] # torchmd-net TensorNet if reference_data: From c87a75bf057e797947656ad899b8052c5f61b473 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 20 Aug 2024 14:25:29 +0200 Subject: [PATCH 08/66] add additional tests for loss functions, add additional scaling factor for force error --- modelforge/potential/processing.py | 2 +- modelforge/tests/test_training.py | 147 ++++++++++++++++++++--------- modelforge/train/training.py | 63 ++++--------- 3 files changed, 123 insertions(+), 89 deletions(-) diff --git a/modelforge/potential/processing.py b/modelforge/potential/processing.py index ee7da6a5..d3b753b2 100644 --- a/modelforge/potential/processing.py +++ b/modelforge/potential/processing.py @@ -302,7 +302,7 @@ def __init__( } self.atomic_self_energies = AtomicSelfEnergies(atomic_self_energies) - def forward(self, data: Dict[str, torch.Tensor]) -> torch.Tensor: + def forward(self, data: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]: """ Calculates the molecular self energy. diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 2bc0f5bd..87c69fea 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -1,24 +1,26 @@ import os -import pytest - import platform +import pytest +import torch + ON_MACOS = platform.system() == "Darwin" IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true" -from modelforge.potential import _Implemented_NNPs -from modelforge.potential import NeuralNetworkPotentialFactory +from modelforge.potential import NeuralNetworkPotentialFactory, _Implemented_NNPs def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): + from importlib import resources + + import toml + from modelforge.tests.data import ( - potential_defaults, - training_defaults, dataset_defaults, + potential_defaults, runtime_defaults, + training_defaults, ) - from importlib import resources - import toml potential_path = ( resources.files(potential_defaults) / f"{potential_name.lower()}.toml" @@ -42,7 +44,7 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): potential_parameters = PotentialParameters(**potential_config_dict["potential"]) from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters + from modelforge.train.parameters import RuntimeParameters, TrainingParameters dataset_parameters = DatasetParameters(**dataset_config_dict["dataset"]) training_parameters = TrainingParameters(**training_config_dict["training"]) @@ -56,17 +58,7 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): } -@pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") -@pytest.mark.parametrize( - "potential_name", _Implemented_NNPs.get_all_neural_network_names() -) -@pytest.mark.parametrize("dataset_name", ["QM9"]) -def test_train_with_lightning(potential_name, dataset_name): - """ - Test 
the forward pass for a given model and dataset. - """ - - # read default parameters +def get_trainer(potential_name: str, dataset_name: str): config = load_configs_into_pydantic_models(potential_name, dataset_name) # Extract parameters @@ -75,44 +67,44 @@ def test_train_with_lightning(potential_name, dataset_name): dataset_parameter = config["dataset"] runtime_parameter = config["runtime"] - from modelforge.potential.models import ( - NeuralNetworkPotentialFactory, - ) - - trainer = ( - NeuralNetworkPotentialFactory.generate_potential( - use="training", - potential_parameter=potential_parameter, - training_parameter=training_parameter, - dataset_parameter=dataset_parameter, - runtime_parameter=runtime_parameter, - ) - .train_potential() - .save_checkpoint("test.chp") # save checkpoint - ) - # continue training - NeuralNetworkPotentialFactory.generate_potential( + return NeuralNetworkPotentialFactory.generate_potential( use="training", potential_parameter=potential_parameter, training_parameter=training_parameter, dataset_parameter=dataset_parameter, runtime_parameter=runtime_parameter, - ).train_potential() + ) + + +@pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") +@pytest.mark.parametrize( + "potential_name", _Implemented_NNPs.get_all_neural_network_names() +) +@pytest.mark.parametrize("dataset_name", ["QM9"]) +def test_train_with_lightning(potential_name, dataset_name): + """ + Test that we can train, save and load checkpoints. + """ + + get_trainer(potential_name, dataset_name).train_potential().save_checkpoint( + "test.chp" + ) # save checkpoint + + # continue training from checkpoint + get_trainer(potential_name, dataset_name).train_potential() def test_train_from_single_toml_file(): - from modelforge.train.training import read_config_and_train - from modelforge.tests import data from importlib import resources + from modelforge.tests import data + from modelforge.train.training import read_config_and_train + config_path = resources.files(data) / f"config.toml" read_config_and_train(config_path) -import torch - - def test_error_calculation(single_batch_with_batchsize_16_with_force): # test the different Loss classes from modelforge.train.training import ( @@ -162,11 +154,78 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): ) reference_F_error = torch.mean( - per_mol_error / data.metadata.atomic_subsystem_counts.unsqueeze(1) + per_mol_error / (3 * data.metadata.atomic_subsystem_counts.unsqueeze(1)) ) assert torch.allclose(F_error, reference_F_error) +def test_loss(single_batch_with_batchsize_16_with_force): + from modelforge.train.training import Loss + + batch = single_batch_with_batchsize_16_with_force + + loss_porperty = ["per_molecule_energy", "per_atom_force"] + loss_weights = {"per_molecule_energy": 0.5, "per_atom_force": 0.5} + loss = Loss(loss_porperty, loss_weights) + assert loss is not None + + # get trainer + trainer = get_trainer("schnet", "QM9") + prediction = trainer.model.calculate_predictions(batch, trainer.model.potential) + + # pass prediction through loss module + loss_output = loss(prediction, batch) + # let's recalculate the loss (NOTE: we scale the loss by the number of atoms) + + # --------------------------------------------- # + # first, calculate E_loss + E_loss = torch.mean( + ( + ( + prediction["per_molecule_energy_predict"] + - prediction["per_molecule_energy_true"] + ).pow(2) + ) + / batch.metadata.atomic_subsystem_counts.unsqueeze(1) + ) + assert 
torch.allclose(loss_output["per_molecule_energy/mse"], E_loss) + + # --------------------------------------------- # + # now calculate F_loss + per_atom_force_squared_error = ( + (prediction["per_atom_force_predict"] - prediction["per_atom_force_true"]) + .pow(2) + .sum(dim=1, keepdim=True) + ).squeeze(-1) + + # # Aggregate error per molecule + per_molecule_squared_error = torch.zeros_like( + batch.metadata.E.squeeze(-1), dtype=per_atom_force_squared_error.dtype + ) + per_molecule_squared_error.scatter_add_( + 0, + batch.nnp_input.atomic_subsystem_indices.long(), + per_atom_force_squared_error, + ) + # divide by number of atoms + per_molecule_squared_error = per_molecule_squared_error / ( + 3 * batch.metadata.atomic_subsystem_counts + ) + + per_atom_force_mse = torch.mean(per_molecule_squared_error) + assert torch.allclose(loss_output["per_atom_force/mse"], per_atom_force_mse) + + # --------------------------------------------- # + # let's double check that the loss is calculated correctly + # calculate the total loss + + assert torch.allclose( + loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy/mse"] + + loss_weights["per_atom_force"] * loss_output["per_atom_force/mse"], + loss_output["total_loss"].to(torch.float32), + ) + + @pytest.mark.skipif(IN_GITHUB_ACTIONS, reason="Skipping this test on GitHub Actions") @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 401e1c91..0a488b9a 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -38,13 +38,6 @@ class Error(nn.Module, ABC): """ Class representing the error calculation for predicted and true values. - - Methods: - calculate_error(predicted: torch.Tensor, true: torch.Tensor) -> torch.Tensor: - Calculates the error between the predicted and true values. - - scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: - Scales the error by the number of atoms in the atomic subsystems. """ @abstractmethod @@ -75,10 +68,13 @@ def calculate_squared_error( torch.Tensor The calculated error. """ - return (predicted_tensor - reference_tensor).pow(2).sum(dim=1, keepdim=True) + error = (predicted_tensor - reference_tensor).pow(2).sum(dim=1, keepdim=True) + return error @staticmethod - def scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: + def scale_by_number_of_atoms( + error, atomic_subsystem_counts, prefactor: int = 1 + ) -> torch.Tensor: """ Scales the error by the number of atoms in the atomic subsystems. @@ -88,15 +84,16 @@ def scale_by_number_of_atoms(error, atomic_subsystem_counts) -> torch.Tensor: The error to be scaled. atomic_subsystem_counts : torch.Tensor The number of atoms in the atomic subsystems. - + prefactor : int + To consider the shape of the property, e.g., if the reference property has shape (N,3) it is necessary to further devide the result by 3 Returns ------- torch.Tensor The scaled error. """ # divide by number of atoms - scaled_by_number_of_atoms = error / atomic_subsystem_counts.unsqueeze( - 1 + scaled_by_number_of_atoms = error / ( + prefactor * atomic_subsystem_counts.unsqueeze(1) ) # FIXME: ensure that all per-atom properties have dimension (N, 1) return scaled_by_number_of_atoms @@ -106,12 +103,6 @@ class FromPerAtomToPerMoleculeMeanSquaredError(Error): Calculates the per-atom error and aggregates it to per-molecule mean squared error. 
""" - def __init__(self): - """ - Initializes the PerAtomToPerMoleculeError class. - """ - super().__init__() - def calculate_error( self, per_atom_prediction: torch.Tensor, @@ -163,7 +154,9 @@ def forward( ) # divide by number of atoms per_molecule_square_error_scaled = self.scale_by_number_of_atoms( - per_molecule_squared_error, batch.metadata.atomic_subsystem_counts + per_molecule_squared_error, + batch.metadata.atomic_subsystem_counts, + prefactor=per_atom_prediction.shape[-1], ) # return the average return torch.mean(per_molecule_square_error_scaled) @@ -171,17 +164,10 @@ def forward( class PerMoleculeMeanSquaredError(Error): """ - Calculates the per-molecule mean squared error. - + Calculates the per-molecule mean squared error. Note that the + error is divided by the number of atoms in the molecule, to remove any bias due to the number of atoms. """ - def __init__(self): - """ - Initializes the PerMoleculeMeanSquaredError class. - """ - - super().__init__() - def forward( self, per_molecule_prediction: torch.Tensor, @@ -210,7 +196,8 @@ def forward( per_molecule_prediction, per_molecule_reference ) per_molecule_square_error_scaled = self.scale_by_number_of_atoms( - per_molecule_squared_error, batch.metadata.atomic_subsystem_counts + per_molecule_squared_error, + batch.metadata.atomic_subsystem_counts, ) # return the average @@ -228,24 +215,12 @@ def calculate_error( class Loss(nn.Module): - """ - Calculates the combined loss for energy and force predictions. - - Attributes - ---------- - loss_property : List[str] - List of properties to include in the loss calculation. - weight : Dict[str, float] - Dictionary containing the weights for each property in the loss calculation. - loss : nn.ModuleDict - Module dictionary containing the loss functions for each property. - """ _SUPPORTED_PROPERTIES = ["per_molecule_energy", "per_atom_force"] def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): """ - Initializes the Loss class. + Calculates the combined loss for energy and force predictions. 
Parameters ---------- @@ -302,11 +277,11 @@ def forward(self, predict_target: Dict[str, torch.Tensor], batch): # iterate over loss properties for prop in self.loss_property: # calculate loss per property - loss_ = self.weight[prop] * self.loss[prop]( + loss_ = self.loss[prop]( predict_target[f"{prop}_predict"], predict_target[f"{prop}_true"], batch ) # add total loss - loss = loss + loss_ + loss = loss + (self.weight[prop] * loss_) # save loss loss_dict[f"{prop}/mse"] = loss_ From 6cc7d3bd4340f0a3aebf7ef7dca957cb23d46120 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 20 Aug 2024 15:35:53 +0200 Subject: [PATCH 09/66] fix bug that resulted in force loss without gradient --- modelforge/tests/test_training.py | 4 ++++ modelforge/train/training.py | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 87c69fea..1d425999 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -176,6 +176,10 @@ def test_loss(single_batch_with_batchsize_16_with_force): # pass prediction through loss module loss_output = loss(prediction, batch) # let's recalculate the loss (NOTE: we scale the loss by the number of atoms) + # --------------------------------------------- # + # make sure that both have gradients + assert prediction["per_molecule_energy_predict"].requires_grad + assert prediction["per_atom_force_predict"].requires_grad # --------------------------------------------- # # first, calculate E_loss diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 0a488b9a..0ea70762 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -406,7 +406,7 @@ def _get_forces( grad = torch.autograd.grad( per_molecule_energy_predict.sum(), nnp_input.positions, - create_graph=False, + create_graph=True, retain_graph=True, )[0] per_atom_force_predict = -1 * grad # Forces are the negative gradient of energy From a1a1dad06f64b1c47f00827a9671b65908f91779 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 20 Aug 2024 17:10:26 +0200 Subject: [PATCH 10/66] remove the scaling by number of atoms parameter, will come back to this later --- modelforge/tests/test_training.py | 2 -- modelforge/train/training.py | 32 ++++++++++++++++++++++++++----- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 1d425999..3b514847 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -139,7 +139,6 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): F_error = error(predicted_F, true_F, data) # compare error (mean squared error scaled by number of atoms in the molecule) - scaled_error = ( torch.linalg.vector_norm(predicted_F - true_F, dim=1, keepdim=True) ** 2 ) @@ -190,7 +189,6 @@ def test_loss(single_batch_with_batchsize_16_with_force): - prediction["per_molecule_energy_true"] ).pow(2) ) - / batch.metadata.atomic_subsystem_counts.unsqueeze(1) ) assert torch.allclose(loss_output["per_molecule_energy/mse"], E_loss) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 0ea70762..a530bed3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -40,6 +40,15 @@ class Error(nn.Module, ABC): Class representing the error calculation for predicted and true values. 
""" + def __init__(self, scale_by_number_of_atoms: bool = True): + + super().__init__() + if not scale_by_number_of_atoms: + # If scaling is not desired, override the method to just return the input error unchanged + self.scale_by_number_of_atoms = ( + lambda error, atomic_subsystem_counts, prefactor=1: error + ) + @abstractmethod def calculate_error( self, predicted: torch.Tensor, true: torch.Tensor @@ -103,6 +112,9 @@ class FromPerAtomToPerMoleculeMeanSquaredError(Error): Calculates the per-atom error and aggregates it to per-molecule mean squared error. """ + def __init__(self, scale_by_number_of_atoms: bool = True): + super().__init__(scale_by_number_of_atoms) + def calculate_error( self, per_atom_prediction: torch.Tensor, @@ -164,10 +176,12 @@ def forward( class PerMoleculeMeanSquaredError(Error): """ - Calculates the per-molecule mean squared error. Note that the - error is divided by the number of atoms in the molecule, to remove any bias due to the number of atoms. + Calculates the per-molecule mean squared error. """ + def __init__(self, scale_by_number_of_atoms: bool = True): + super().__init__(scale_by_number_of_atoms) + def forward( self, per_molecule_prediction: torch.Tensor, @@ -216,7 +230,7 @@ def calculate_error( class Loss(nn.Module): - _SUPPORTED_PROPERTIES = ["per_molecule_energy", "per_atom_force"] + _SUPPORTED_PROPERTIES = ["per_atom_energy", "per_molecule_energy", "per_atom_force"] def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): """ @@ -245,9 +259,17 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): for prop, w in weight.items(): if prop in self._SUPPORTED_PROPERTIES: if prop == "per_atom_force": - self.loss[prop] = FromPerAtomToPerMoleculeMeanSquaredError() + self.loss[prop] = FromPerAtomToPerMoleculeMeanSquaredError( + scale_by_number_of_atoms=True + ) + elif prop == "per_atom_energy": + self.loss[prop] = PerMoleculeMeanSquaredError( + scale_by_number_of_atoms=True + ) # FIXME: this is currently not working elif prop == "per_molecule_energy": - self.loss[prop] = PerMoleculeMeanSquaredError() + self.loss[prop] = PerMoleculeMeanSquaredError( + scale_by_number_of_atoms=False + ) self.register_buffer(prop, torch.tensor(w)) else: raise NotImplementedError(f"Loss type {prop} not implemented.") From 8e97062336343f1e36f5a1fb376592a08204b7fb Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 20 Aug 2024 17:11:43 +0200 Subject: [PATCH 11/66] formatinng fix --- modelforge/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index a530bed3..53f5df4b 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -176,7 +176,7 @@ def forward( class PerMoleculeMeanSquaredError(Error): """ - Calculates the per-molecule mean squared error. + Calculates the per-molecule mean squared error. 
""" def __init__(self, scale_by_number_of_atoms: bool = True): From 389ff8f614606766a8683039012cfc23bb4d9c01 Mon Sep 17 00:00:00 2001 From: wiederm Date: Wed, 21 Aug 2024 13:21:10 +0200 Subject: [PATCH 12/66] small modifications --- modelforge/tests/test_dataset.py | 3 ++- modelforge/train/training.py | 10 ++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index 7702892b..e2120562 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -713,7 +713,8 @@ def test_numpy_dataset_assignment(dataset_name): def test_energy_postprocessing(): - # setup test dataset + # test that the mean and stddev of the dataset + # are correct from modelforge.dataset.dataset import DataModule # test the self energy calculation on the QM9 dataset diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 53f5df4b..ebcce9db 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -651,12 +651,12 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: for key, loss in loss_dict.items(): self.log( f"loss/{key}", - torch.mean(loss), + loss, on_step=False, prog_bar=True, on_epoch=True, - batch_size=1, - ) # batch size is 1 because the mean of the batch is logged + sync_dist=True, + ) return loss_dict["total_loss"] @@ -790,7 +790,9 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics[name] = metric.compute() metric.reset() # log dict, print val metrics to console - self.log_dict(metrics, on_epoch=True, prog_bar=(phase == "val")) + self.log_dict( + metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True + ) def configure_optimizers(self): """ From 5b75fed7feb17d0a30361db86515a0af599c7614 Mon Sep 17 00:00:00 2001 From: wiederm Date: Wed, 21 Aug 2024 14:19:48 +0200 Subject: [PATCH 13/66] add batchsize --- modelforge/dataset/dataset.py | 3 +++ modelforge/tests/test_training.py | 8 +++---- modelforge/train/training.py | 38 +++++++++++++++++-------------- 3 files changed, 28 insertions(+), 21 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 53142d86..c7b4d326 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -238,6 +238,9 @@ def to( return self + def batch_size(self): + return self.metadata.E.size(dim=0) + class TorchDataset(torch.utils.data.Dataset[BatchData]): """ Wraps a numpy dataset to make it compatible with PyTorch DataLoader. 
diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 3b514847..76847915 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -108,8 +108,8 @@ def test_train_from_single_toml_file(): def test_error_calculation(single_batch_with_batchsize_16_with_force): # test the different Loss classes from modelforge.train.training import ( - FromPerAtomToPerMoleculeMeanSquaredError, - PerMoleculeMeanSquaredError, + FromPerAtomToPerMoleculeSquaredError, + PerMoleculeSquaredError, ) # generate data @@ -122,7 +122,7 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): predicted_F = true_F + torch.rand_like(true_F) * 10 # test error for property with shape (nr_of_molecules, 1) - error = PerMoleculeMeanSquaredError() + error = PerMoleculeSquaredError() E_error = error(predicted_E, true_E, data) # compare output (mean squared error scaled by number of atoms in the molecule) @@ -135,7 +135,7 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): assert torch.allclose(E_error, reference_E_error) # test error for property with shape (nr_of_atoms, 3) - error = FromPerAtomToPerMoleculeMeanSquaredError() + error = FromPerAtomToPerMoleculeSquaredError() F_error = error(predicted_F, true_F, data) # compare error (mean squared error scaled by number of atoms in the molecule) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index ebcce9db..97057941 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -25,10 +25,10 @@ __all__ = [ "Error", - "FromPerAtomToPerMoleculeMeanSquaredError", + "FromPerAtomToPerMoleculeSquaredError", "Loss", "LossFactory", - "PerMoleculeMeanSquaredError", + "PerMoleculeSquaredError", "ModelTrainer", "create_error_metrics", "ModelTrainer", @@ -107,7 +107,7 @@ def scale_by_number_of_atoms( return scaled_by_number_of_atoms -class FromPerAtomToPerMoleculeMeanSquaredError(Error): +class FromPerAtomToPerMoleculeSquaredError(Error): """ Calculates the per-atom error and aggregates it to per-molecule mean squared error. """ @@ -170,11 +170,11 @@ def forward( batch.metadata.atomic_subsystem_counts, prefactor=per_atom_prediction.shape[-1], ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + return per_molecule_square_error_scaled -class PerMoleculeMeanSquaredError(Error): + +class PerMoleculeSquaredError(Error): """ Calculates the per-molecule mean squared error. 
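Aside, illustrative only and independent of the patch above: the per-atom to per-molecule aggregation that FromPerAtomToPerMoleculeSquaredError performs can be reduced to a scatter_add_ over an atom-to-molecule index (the role played by atomic_subsystem_indices), followed by a division by the number of atoms per molecule. A self-contained sketch with made-up numbers:

import torch

per_atom_sq_err = torch.tensor([1.0, 2.0, 3.0, 4.0, 5.0])  # five atoms
mol_index = torch.tensor([0, 0, 0, 1, 1])                   # first three atoms belong to molecule 0
n_molecules = 2

# sum the per-atom contributions into per-molecule buckets
per_molecule_sq_err = torch.zeros(n_molecules).scatter_add_(0, mol_index, per_atom_sq_err)

# normalize by molecule size so small and large molecules weigh equally
n_atoms = torch.tensor([3.0, 2.0])
per_molecule_mse = per_molecule_sq_err / n_atoms

print(per_molecule_sq_err)  # tensor([6., 9.])
print(per_molecule_mse)     # tensor([2.0000, 4.5000])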
""" @@ -214,8 +214,7 @@ def forward( batch.metadata.atomic_subsystem_counts, ) - # return the average - return torch.mean(per_molecule_square_error_scaled) + return per_molecule_square_error_scaled def calculate_error( self, @@ -259,15 +258,15 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): for prop, w in weight.items(): if prop in self._SUPPORTED_PROPERTIES: if prop == "per_atom_force": - self.loss[prop] = FromPerAtomToPerMoleculeMeanSquaredError( + self.loss[prop] = FromPerAtomToPerMoleculeSquaredError( scale_by_number_of_atoms=True ) elif prop == "per_atom_energy": - self.loss[prop] = PerMoleculeMeanSquaredError( + self.loss[prop] = PerMoleculeSquaredError( scale_by_number_of_atoms=True ) # FIXME: this is currently not working elif prop == "per_molecule_energy": - self.loss[prop] = PerMoleculeMeanSquaredError( + self.loss[prop] = PerMoleculeSquaredError( scale_by_number_of_atoms=False ) self.register_buffer(prop, torch.tensor(w)) @@ -638,27 +637,32 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: The loss tensor computed for the current training step. """ - # calculate energy and forces + # calculate energy and forces, Note that `predict_target` is a + # dictionary containing the predicted and true values for energy and + # force` predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss + # calculate the loss (for every entry in predict_target the squared + # error is calculated) loss_dict = self.loss(predict_target, batch) - # Update and log training error - self._update_metrics(self.train_error, predict_target) + # Update and log training error (if requested) + if self.log_on_training_step: + self._update_metrics(self.train_error, predict_target) # log the loss (this includes the individual contributions that the loss contains) for key, loss in loss_dict.items(): self.log( f"loss/{key}", - loss, + torch.mean(loss), on_step=False, prog_bar=True, on_epoch=True, sync_dist=True, + batch_size=batch.batch_size(), ) - return loss_dict["total_loss"] + return torch.mean(loss_dict["total_loss"]) @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: From 0a40740f6be5b778817abe9295c3daa60afb34cb Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Wed, 21 Aug 2024 09:55:01 -0700 Subject: [PATCH 14/66] Include dataset/yaml_files/*.yaml in MANIFEST.in --- MANIFEST.in | 2 ++ 1 file changed, 2 insertions(+) diff --git a/MANIFEST.in b/MANIFEST.in index e0267afd..958b9bef 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,3 +1,5 @@ include CODE_OF_CONDUCT.md +include modelforge/dataset/yaml_files/*.yaml + global-exclude *.py[cod] __pycache__ *.so From 98f88b05c438b1be9692559d55c5d013b7beac6c Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Wed, 21 Aug 2024 13:43:22 -0700 Subject: [PATCH 15/66] Remove setup.py and include more files in MANIFEST --- MANIFEST.in | 3 +++ pyproject.toml | 2 +- setup.py | 21 --------------------- 3 files changed, 4 insertions(+), 22 deletions(-) delete mode 100644 setup.py diff --git a/MANIFEST.in b/MANIFEST.in index 958b9bef..54419180 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,8 @@ include CODE_OF_CONDUCT.md include modelforge/dataset/yaml_files/*.yaml +include modelforge/curation/yaml_files/*.yaml +include modelforge/tests/data/potential_defaults/*.toml +include modelforge/tests/data/training_defaults/*.toml global-exclude *.py[cod] __pycache__ *.so diff --git a/pyproject.toml b/pyproject.toml index ed8b70a9..b6ca3463 100644 --- 
a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ zip-safe = false # (instead of using 'find'). With 'find', the 'tests' subpackage is discovered # recursively because of its __init__.py file, but the data subdirectory is excluded # with include-package-data = false and namespaces = false. -include-package-data = false +include-package-data = true [tool.setuptools.packages.find] namespaces = false where = ["."] diff --git a/setup.py b/setup.py deleted file mode 100644 index c7e538b2..00000000 --- a/setup.py +++ /dev/null @@ -1,21 +0,0 @@ -from setuptools import setup - -setup( - name="modelforge", - version="0.1", - packages=["modelforge"], - package_data={ - "modelforge": [ - "dataset/yaml_files/*", - "curation/yaml_files/*", - "tests/data/potential_defaults/*", - "tests/data/training_defaults/*", - ] - }, - url="https://github.com/choderalab/modelforge", - license="MIT", - author="Chodera lab, Marcus Wieder, Christopher Iacovella, and others", - author_email="", - description="A library for building and training neural network potentials", - include_package_data=True, -) From 2876384ccbb663306dc609fbb58590dfce1179ff Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 16:13:50 +0200 Subject: [PATCH 16/66] update --- modelforge/dataset/dataset.py | 2 +- modelforge/tests/conftest.py | 28 ++-------- .../tests/data/training_defaults/default.toml | 6 +- modelforge/tests/test_dataset.py | 5 +- modelforge/tests/test_models.py | 55 ++++++++++++------- modelforge/tests/test_nn.py | 8 ++- modelforge/tests/test_painn.py | 11 ++-- modelforge/tests/test_physnet.py | 6 +- modelforge/tests/test_sake.py | 21 ++++--- modelforge/tests/test_training.py | 12 ++-- modelforge/train/training.py | 49 +++++++++++------ scripts/config.toml | 21 ++++--- 12 files changed, 125 insertions(+), 99 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index c7b4d326..440c867b 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1343,7 +1343,7 @@ def _per_datapoint_operations( from tqdm import tqdm # remove the self energies if requested - log.info("Precalculating pairlist for dataset") + log.info("Performing per datapoint operations in the dataset dataset") if self.remove_self_energies: log.info("Removing self energies from the dataset") diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index 7e8e475f..f7ccd939 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -96,35 +96,15 @@ def single_batch(batch_size: int = 64, dataset_name="QM9"): @pytest.fixture(scope="session") -def single_batch_with_batchsize_64(): +def single_batch_with_batchsize(): """ Utility fixture to create a single batch of data for testing. """ - return single_batch(batch_size=64) + def _create_single_batch(batch_size: int, dataset_name: str): + return single_batch(batch_size=batch_size, dataset_name=dataset_name) -@pytest.fixture(scope="session") -def single_batch_with_batchsize_1(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=1) - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_2_with_force(): - """ - Utility fixture to create a single batch of data for testing. - """ - return single_batch(batch_size=2, dataset_name="PHALKETHOH") - - -@pytest.fixture(scope="session") -def single_batch_with_batchsize_16_with_force(): - """ - Utility fixture to create a single batch of data for testing. 
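Aside, not part of PATCH 14/15: once include-package-data is enabled and the MANIFEST.in entries above are in place, the shipped YAML files can be read back through importlib.resources. The package and directory names below are inferred from the MANIFEST entries and are assumptions, not a documented modelforge API:

from importlib import resources

def list_packaged_yaml_files() -> list:
    """Return the dataset YAML files that were actually packaged with the install."""
    yaml_dir = resources.files("modelforge.dataset") / "yaml_files"
    return sorted(entry.name for entry in yaml_dir.iterdir() if entry.name.endswith(".yaml"))

if __name__ == "__main__":
    # raises ModuleNotFoundError / FileNotFoundError if the data files did not ship
    print(list_packaged_yaml_files())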
- """ - return single_batch(batch_size=16, dataset_name="PHALKETHOH") + return _create_single_batch def initialize_dataset( diff --git a/modelforge/tests/data/training_defaults/default.toml b/modelforge/tests/data/training_defaults/default.toml index 19c0d3b7..1e4025a9 100644 --- a/modelforge/tests/data/training_defaults/default.toml +++ b/modelforge/tests/data/training_defaults/default.toml @@ -35,11 +35,11 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = ['per_molecule_energy', 'per_atom_force'] # use +loss_property = ['per_molecule_energy'] #, 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.999 #NOTE: reciprocal units -per_atom_force = 0.001 +per_molecule_energy = 1.0 #NOTE: reciprocal units +#per_atom_force = 0.001 [training.early_stopping] diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index e2120562..b591fa30 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -458,10 +458,11 @@ def test_data_item_format_of_datamodule( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_dataset_neighborlist(potential_name, single_batch_with_batchsize_64): +def test_dataset_neighborlist(potential_name, single_batch_with_batchsize): """Test the neighborlist.""" - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = single_batch_with_batchsize(64, 'QM9') + nnp_input = batch.nnp_input # test that the neighborlist is correctly generated from modelforge.tests.test_models import load_configs_into_pydantic_models diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 9709c6b0..58602e31 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -54,11 +54,13 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_JAX_wrapping(potential_name, single_batch_with_batchsize_64): +def test_JAX_wrapping(potential_name, single_batch_with_batchsize): from modelforge.potential.models import ( NeuralNetworkPotentialFactory, ) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") @@ -70,7 +72,7 @@ def test_JAX_wrapping(potential_name, single_batch_with_batchsize_64): ) assert "JAX" in str(type(model)) - nnp_input = single_batch_with_batchsize_64.nnp_input.as_jax_namedtuple() + nnp_input = batch.nnp_input.as_jax_namedtuple() out = model(nnp_input)["per_molecule_energy"] import jax @@ -329,13 +331,14 @@ def test_dataset_statistic(potential_name): "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_energy_between_simulation_environments( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): # compare that the energy is the same for the JAX and PyTorch Model import numpy as np import torch - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the forward pass through each of the models # cast input and model to torch.float64 # read default parameters @@ -416,20 +419,24 @@ def test_forward_pass_with_all_datasets( assert torch.all(pair_list[0, 1:] >= pair_list[0, :-1]) 
+@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) @pytest.mark.parametrize("simulation_environment", ["JAX", "PyTorch"]) def test_forward_pass( - potential_name, simulation_environment, single_batch_with_batchsize_64 + dataset_name, potential_name, simulation_environment, single_batch_with_batchsize ): # this test sends a single batch from different datasets through the model import torch - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=6, dataset_name=dataset_name) + nnp_input = batch.nnp_input # read default parameters - config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") + config = load_configs_into_pydantic_models( + f"{potential_name.lower()}", dataset_name + ) nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] # test the forward pass through each of the models @@ -443,6 +450,7 @@ def test_forward_pass( output = model(nnp_input) + # test that we get an energie per molecule assert len(output["per_molecule_energy"]) == nr_of_mols @@ -450,8 +458,13 @@ def test_forward_pass( # which have chemically equivalent hydrogens at the minimum geometry. # This has to be reflected in the atomic energies E_i, which # have to be equal for all hydrogens - if "JAX" not in str(type(model)): - from loguru import logger as log + if "JAX" not in str(type(model)) and dataset_name == "QM9": + # make sure that we are correctly reducing + ref = torch.zeros_like(output["per_molecule_energy"]).scatter_add_( + 0, nnp_input.atomic_subsystem_indices.long(), output["per_atom_energy"] + ) + + assert torch.allclose(ref, output["per_molecule_energy"]) # assert that the following tensor has equal values for dim=0 index 1 to 4 and 6 to 8 @@ -478,17 +491,18 @@ def test_forward_pass( @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize_64): +def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsize): """ Test the calculation of energies and forces for a molecule. """ import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") # get batch - nnp_input = single_batch_with_batchsize_64.nnp_input + nnp_input = batch.nnp_input # test the pass through each of the models model_training = NeuralNetworkPotentialFactory.generate_potential( @@ -536,7 +550,7 @@ def test_calculate_energies_and_forces(potential_name, single_batch_with_batchsi "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) def test_calculate_energies_and_forces_with_jax( - potential_name, single_batch_with_batchsize_64 + potential_name, single_batch_with_batchsize ): """ Test the calculation of energies and forces for a molecule. 
@@ -545,8 +559,8 @@ def test_calculate_energies_and_forces_with_jax( # read default parameters config = load_configs_into_pydantic_models(f"{potential_name.lower()}", "qm9") - - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # test the backward pass through each of the models nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] nr_of_atoms_per_batch = nnp_input.atomic_subsystem_indices.shape[0] @@ -929,11 +943,11 @@ def test_pairlist_on_dataset(): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -def test_casting(potential_name, single_batch_with_batchsize_64): +def test_casting(potential_name, single_batch_with_batchsize): # test dtype casting import torch - batch = single_batch_with_batchsize_64 + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") batch_ = batch.to(dtype=torch.float64) assert batch_.nnp_input.positions.dtype == torch.float64 batch_ = batch_.to(dtype=torch.float32) @@ -978,7 +992,7 @@ def test_casting(potential_name, single_batch_with_batchsize_64): def test_equivariant_energies_and_forces( potential_name, simulation_environment, - single_batch_with_batchsize_64, + single_batch_with_batchsize, equivariance_utils, ): """ @@ -1001,7 +1015,8 @@ def test_equivariant_energies_and_forces( translation, rotation, reflection = equivariance_utils # define the tolerance atol = 1e-3 - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + nnp_input = batch.nnp_input # initialize the models model = model.to(dtype=torch.float64) @@ -1009,7 +1024,9 @@ def test_equivariant_energies_and_forces( # ------------------- # # start the test # reference values - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input.to(dtype=torch.float64) reference_result = model(nnp_input)["per_molecule_energy"].to(dtype=torch.float64) reference_forces = -torch.autograd.grad( reference_result.sum(), diff --git a/modelforge/tests/test_nn.py b/modelforge/tests/test_nn.py index e399550a..a5b0b276 100644 --- a/modelforge/tests/test_nn.py +++ b/modelforge/tests/test_nn.py @@ -1,14 +1,16 @@ from .test_models import load_configs_into_pydantic_models -def test_embedding(single_batch_with_batchsize_64): +def test_embedding(single_batch_with_batchsize): # test the input featurization, including: # - nuclear charge embedding # - total charge mixing - import torch + import torch # noqa: F401 - nnp_input = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + nnp_input = batch.nnp_input model_name = "SchNet" # read default parameters and extract featurization config = load_configs_into_pydantic_models(f"{model_name.lower()}", "qm9") diff --git a/modelforge/tests/test_painn.py b/modelforge/tests/test_painn.py index 647d9901..3dc60d48 100644 --- a/modelforge/tests/test_painn.py +++ b/modelforge/tests/test_painn.py @@ -2,7 +2,7 @@ from modelforge.potential.painn import PaiNN -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """Test initialization of the PaiNN neural network potential.""" # read default parameters from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -17,8 
+17,9 @@ def test_forward(single_batch_with_batchsize_64): ], ) assert painn is not None, "PaiNN model should be initialized." + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") - nnp_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32) + nnp_input = batch.nnp_input.to(dtype=torch.float32) energy = painn(nnp_input)["per_molecule_energy"] nr_of_mols = nnp_input.atomic_subsystem_indices.unique().shape[0] @@ -27,11 +28,13 @@ def test_forward(single_batch_with_batchsize_64): ) # Assuming energy is calculated per sample in the batch -def test_equivariance(single_batch_with_batchsize_64): +def test_equivariance(single_batch_with_batchsize): from modelforge.potential.painn import PaiNN from dataclasses import replace import torch + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + from modelforge.tests.test_models import load_configs_into_pydantic_models # read default parameters @@ -50,7 +53,7 @@ def test_equivariance(single_batch_with_batchsize_64): ], ).double() - methane_input = single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float64) + methane_input = batch.nnp_input.to(dtype=torch.float64) perturbed_methane_input = replace(methane_input) perturbed_methane_input.positions = torch.matmul( methane_input.positions, rotation_matrix diff --git a/modelforge/tests/test_physnet.py b/modelforge/tests/test_physnet.py index 8aadca05..1601654d 100644 --- a/modelforge/tests/test_physnet.py +++ b/modelforge/tests/test_physnet.py @@ -15,7 +15,7 @@ def test_init(): ) -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): import torch from modelforge.potential.physnet import PhysNet @@ -37,7 +37,9 @@ def test_forward(single_batch_with_batchsize_64): ) model = model.to(torch.float32) print(model) - yhat = model(single_batch_with_batchsize_64.nnp_input.to(dtype=torch.float32)) + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + yhat = model(batch.nnp_input.to(dtype=torch.float32)) def test_compare_representation(): diff --git a/modelforge/tests/test_sake.py b/modelforge/tests/test_sake.py index 8fff65ba..b3f36ee1 100644 --- a/modelforge/tests/test_sake.py +++ b/modelforge/tests/test_sake.py @@ -33,12 +33,13 @@ def test_init(): from openff.units import unit -def test_forward(single_batch_with_batchsize_64): +def test_forward(single_batch_with_batchsize): """ Test the forward pass of the SAKE model. 
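Aside, not part of the PaiNN/SAKE patches: the equivariance tests above boil down to one property, a rigid rotation of the input positions must leave the energy unchanged and must rotate the forces. A compact, self-contained version of that check, with a toy distance-based energy standing in for the network:

import torch

def toy_energy(positions: torch.Tensor) -> torch.Tensor:
    # depends only on pairwise displacements, hence rotationally invariant
    diff = positions.unsqueeze(0) - positions.unsqueeze(1)
    return (diff ** 2).sum()

rotation = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]])  # 90 deg about z
x = torch.randn(4, 3, requires_grad=True)
x_rot = (x.detach() @ rotation).requires_grad_(True)

e, e_rot = toy_energy(x), toy_energy(x_rot)
f = -torch.autograd.grad(e, x)[0]
f_rot = -torch.autograd.grad(e_rot, x_rot)[0]

print(torch.allclose(e, e_rot, atol=1e-5))              # energies are invariant
print(torch.allclose(f @ rotation, f_rot, atol=1e-5))   # forces rotate with the frame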
""" # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + methane = batch.nnp_input from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -91,7 +92,7 @@ def test_interaction_forward(): @pytest.mark.parametrize("eq_atol", [3e-1]) @pytest.mark.parametrize("h_atol", [8e-2]) -def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): +def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize): import torch from modelforge.potential.sake import SAKE from dataclasses import replace @@ -118,7 +119,9 @@ def test_layer_equivariance(h_atol, eq_atol, single_batch_with_batchsize_64): ) # get methane input - methane = single_batch_with_batchsize_64.nnp_input + batch = batch = single_batch_with_batchsize(batch_size=64, dataset_name="QM9") + + methane = batch.nnp_input perturbed_methane_input = replace(methane) perturbed_methane_input.positions = torch.matmul(methane.positions, rotation_matrix) @@ -424,7 +427,7 @@ def test_sake_layer_against_reference(include_self_pairs, v_is_none): # FIXME: this test is currently failing @pytest.mark.xfail -def test_model_against_reference(single_batch_with_batchsize_1): +def test_model_against_reference(single_batch_with_batchsize): nr_heads = 5 key = jax.random.PRNGKey(1884) torch.manual_seed(1884) @@ -462,7 +465,8 @@ def test_model_against_reference(single_batch_with_batchsize_1): ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1) + methane = batch.nnp_input pairlist_output = mf_sake.compute_interacting_pairs.prepare_inputs(methane) prepared_methane = mf_sake.core_module._model_specific_input_preparation( methane, pairlist_output @@ -605,7 +609,7 @@ def test_model_against_reference(single_batch_with_batchsize_1): # assert torch.allclose(mf_out.E, torch.from_numpy(onp.array(ref_out[0]))) -def test_model_invariance(single_batch_with_batchsize_1): +def test_model_invariance(single_batch_with_batchsize): from dataclasses import replace from modelforge.tests.test_models import load_configs_into_pydantic_models @@ -620,7 +624,8 @@ def test_model_invariance(single_batch_with_batchsize_1): ], ) # get methane input - methane = single_batch_with_batchsize_1.nnp_input + batch = single_batch_with_batchsize(batch_size=1) + methane = batch.nnp_input rotation_matrix = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) perturbed_methane_input = replace(methane) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 76847915..b50f6454 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,7 +80,7 @@ def get_trainer(potential_name: str, dataset_name: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -@pytest.mark.parametrize("dataset_name", ["QM9"]) +@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) def test_train_with_lightning(potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
@@ -105,7 +105,7 @@ def test_train_from_single_toml_file(): read_config_and_train(config_path) -def test_error_calculation(single_batch_with_batchsize_16_with_force): +def test_error_calculation(single_batch_with_batchsize): # test the different Loss classes from modelforge.train.training import ( FromPerAtomToPerMoleculeSquaredError, @@ -113,7 +113,9 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): ) # generate data - data = single_batch_with_batchsize_16_with_force + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") + + data = batch true_E = data.metadata.E true_F = data.metadata.F @@ -158,10 +160,10 @@ def test_error_calculation(single_batch_with_batchsize_16_with_force): assert torch.allclose(F_error, reference_F_error) -def test_loss(single_batch_with_batchsize_16_with_force): +def test_loss(single_batch_with_batchsize): from modelforge.train.training import Loss - batch = single_batch_with_batchsize_16_with_force + batch = single_batch_with_batchsize(batch_size=16, dataset_name="PHALKETHOH") loss_porperty = ["per_molecule_energy", "per_atom_force"] loss_weights = {"per_molecule_energy": 0.5, "per_atom_force": 0.5} diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 97057941..84b19100 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -267,7 +267,7 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): ) # FIXME: this is currently not working elif prop == "per_molecule_energy": self.loss[prop] = PerMoleculeSquaredError( - scale_by_number_of_atoms=False + scale_by_number_of_atoms=True ) self.register_buffer(prop, torch.tensor(w)) else: @@ -376,20 +376,18 @@ def create_error_metrics(loss_properties: List[str]) -> ModuleDict: class CalculateProperties(torch.nn.Module): - def __init__(self): + def __init__(self, requested_properties: List[str]): """ A utility class for calculating properties such as energies and forces from batches using a neural network model. - Methods - ------- - _get_forces(batch: BatchData, energies: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor] - Computes the forces from a given batch using the model. - _get_energies(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies from a given batch using the model. - forward(batch: BatchData, model: Type[torch.nn.Module]) -> Dict[str, torch.Tensor] - Computes the energies and forces from a given batch using the model. + Parameters + """ super().__init__() + self.requested_properties = requested_properties + self.include_force = False + if "force" in self.requested_properties: + self.include_force = True def _get_forces( self, batch: "BatchData", energies: Dict[str, torch.Tensor] @@ -490,7 +488,10 @@ def forward( The true and predicted energies and forces from the dataset and the model. 
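Aside, a stripped-down illustration of the CalculateProperties change above, with toy names and a toy energy: forces are only computed when they are among the requested training targets, so energy-only runs skip the extra backward pass.

import torch

class ToyPropertyCalculator:
    def __init__(self, requested_properties):
        self.include_force = "per_atom_force" in requested_properties

    def __call__(self, positions: torch.Tensor) -> dict:
        energy = (positions ** 2).sum()
        out = {"per_molecule_energy": energy}
        if self.include_force:
            # create_graph=True keeps the graph so a force loss can be backpropagated
            out["per_atom_force"] = -torch.autograd.grad(
                energy, positions, create_graph=True
            )[0]
        return out

pos = torch.randn(3, 3, requires_grad=True)
print(ToyPropertyCalculator(["per_molecule_energy"])(pos).keys())
print(ToyPropertyCalculator(["per_molecule_energy", "per_atom_force"])(pos).keys())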
""" energies = self._get_energies(batch, model) - forces = self._get_forces(batch, energies) + if self.include_force: + forces = self._get_forces(batch, energies) + else: + forces = {} return {**energies, **forces} @@ -542,7 +543,9 @@ def __init__( potential_seed=potential_seed, ) - self.calculate_predictions = CalculateProperties() + self.calculate_predictions = CalculateProperties( + training_parameter.loss_parameter.loss_property + ) self.optimizer = training_parameter.optimizer self.learning_rate = training_parameter.lr self.lr_scheduler = training_parameter.lr_scheduler @@ -659,10 +662,12 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: prog_bar=True, on_epoch=True, sync_dist=True, - batch_size=batch.batch_size(), + batch_size=1, # batch.batch_size(), ) - return torch.mean(loss_dict["total_loss"]) + loss = torch.mean(loss_dict["total_loss"]) + + return loss @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -685,10 +690,15 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: batch.nnp_input.positions.requires_grad_(True) # calculate energy and forces predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss - loss = self.loss(predict_target, batch) - # log the loss self._update_metrics(self.val_error, predict_target) + # calculate the MSE with torch + l1 = torch.nn.functional.l1_loss( + predict_target["per_molecule_energy_predict"], + predict_target["per_molecule_energy_true"], + ) + + self.mae_validation_set += l1.item() + self.nr_of_batches += 1 @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -798,6 +808,11 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True ) + mse_loss = self.mse_training_set / self.nr_of_batches + mae_val = self.mae_validation_set / self.nr_of_batches + + a = 7 + def configure_optimizers(self): """ Configures the model's optimizers (and optionally schedulers). 
diff --git a/scripts/config.toml b/scripts/config.toml index 32e3018b..e0c84a82 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -2,11 +2,11 @@ potential_name = "SchNet" [potential.core_parameter] -number_of_radial_basis_functions = 20 +number_of_radial_basis_functions = 32 maximum_interaction_radius = "5.0 angstrom" -number_of_interaction_modules = 3 -number_of_filters = 32 -shared_interactions = false +number_of_interaction_modules = 5 +number_of_filters = 64 +shared_interactions = true [potential.core_parameter.activation_function_parameter] activation_function_name = "ShiftedSoftplus" @@ -14,7 +14,7 @@ activation_function_name = "ShiftedSoftplus" [potential.core_parameter.featurization] properties_to_featurize = ['atomic_number'] maximum_atomic_number = 101 -number_of_per_atom_features = 32 +number_of_per_atom_features = 128 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] @@ -31,10 +31,10 @@ num_workers = 4 pin_memory = true [training] -number_of_epochs = 4 +number_of_epochs = 1000 remove_self_energies = true -batch_size = 128 -lr = 1e-3 +batch_size = 16 +lr = 0.5e-3 monitor = "val/per_molecule_energy/rmse" [training.experiment_logger] @@ -56,11 +56,10 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = ['per_molecule_energy', 'per_atom_force'] # use +loss_property = ['per_molecule_energy'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.999 #NOTE: reciprocal units -per_atom_force = 0.001 +per_molecule_energy = 1.0 #NOTE: reciprocal units [training.early_stopping] From ba118ebbc83060d9b48773035cc958ed97c3e905 Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 16:28:02 +0200 Subject: [PATCH 17/66] update optimizer, fix bug --- modelforge/train/parameters.py | 2 +- modelforge/train/training.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 8fe6c2c9..1ea820d7 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -270,7 +270,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": stochastic_weight_averaging: Optional[StochasticWeightAveraging] = None experiment_logger: ExperimentLogger verbose: bool = False - optimizer: Type[torch.optim.Optimizer] = torch.optim.Adam + optimizer: Type[torch.optim.Optimizer] = torch.optim.AdamW ### Runtime Parameters diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 84b19100..32671f1c 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -691,14 +691,7 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: # calculate energy and forces predict_target = self.calculate_predictions(batch, self.potential) self._update_metrics(self.val_error, predict_target) - # calculate the MSE with torch - l1 = torch.nn.functional.l1_loss( - predict_target["per_molecule_energy_predict"], - predict_target["per_molecule_energy_true"], - ) - self.mae_validation_set += l1.item() - self.nr_of_batches += 1 @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -808,10 +801,6 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True ) - mse_loss = self.mse_training_set / self.nr_of_batches - mae_val = self.mae_validation_set / self.nr_of_batches - - a = 7 def configure_optimizers(self): """ From ae7ee910575c889cab2f88ab72e7ab9863bd49cf Mon Sep 
17 00:00:00 2001 From: wiederm Date: Thu, 22 Aug 2024 19:00:33 +0200 Subject: [PATCH 18/66] use custom torchmetric to sync accross differnt nodes --- modelforge/train/training.py | 79 ++++++++++++++++++++++-------------- 1 file changed, 48 insertions(+), 31 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 32671f1c..8b9f2549 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -273,7 +273,9 @@ def __init__(self, loss_porperty: List[str], weight: Dict[str, float]): else: raise NotImplementedError(f"Loss type {prop} not implemented.") - def forward(self, predict_target: Dict[str, torch.Tensor], batch): + def forward( + self, predict_target: Dict[str, torch.Tensor], batch + ) -> Dict[str, torch.Tensor]: """ Calculates the combined loss for the specified properties. @@ -341,7 +343,7 @@ def create_loss(loss_property: List[str], weight: Dict[str, float]) -> Type[Loss from torch.nn import ModuleDict -def create_error_metrics(loss_properties: List[str]) -> ModuleDict: +def create_error_metrics(loss_properties: List[str], loss: bool = False) -> ModuleDict: """ Creates a ModuleDict of MetricCollections for the given loss properties. @@ -361,14 +363,36 @@ def create_error_metrics(loss_properties: List[str]) -> ModuleDict: ) from torchmetrics import MetricCollection - return ModuleDict( - { - prop: MetricCollection( - [MeanAbsoluteError(), MeanSquaredError(squared=False)] - ) - for prop in loss_properties - } - ) + if loss: + return ModuleDict({prop: MeanLossMetric() for prop in loss_properties}) + else: + return ModuleDict( + { + prop: MetricCollection( + [MeanAbsoluteError(), MeanSquaredError(squared=False)] + ) + for prop in loss_properties + } + ) + + +import torchmetrics + + +class MeanLossMetric(torchmetrics.Metric): + def __init__(self): + super().__init__() + self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") + self.add_state("total_batches", default=torch.tensor(0), dist_reduce_fx="sum") + + def update(self, loss: torch.Tensor, batch_size: int): + # Accumulate the loss sum and batch count + self.sum_loss += loss.sum() + self.total_batches += batch_size + + def compute(self): + # Compute the mean loss + return self.sum_loss / self.total_batches from modelforge.train.parameters import RuntimeParameters, TrainingParameters @@ -573,6 +597,10 @@ def __init__( self.train_error = create_error_metrics( training_parameter.loss_parameter.loss_property ) + # Initialize your custom metric + self.train_loss_metric = create_error_metrics( + training_parameter.loss_parameter.loss_property, loss=True + ) def forward(self, batch: "BatchData") -> Dict[str, torch.Tensor]: """ @@ -645,29 +673,18 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # force` predict_target = self.calculate_predictions(batch, self.potential) - # calculate the loss (for every entry in predict_target the squared - # error is calculated) + # Calculate the loss loss_dict = self.loss(predict_target, batch) - # Update and log training error (if requested) - if self.log_on_training_step: - self._update_metrics(self.train_error, predict_target) - - # log the loss (this includes the individual contributions that the loss contains) - for key, loss in loss_dict.items(): + # Update the custom metric with the different loss components + for key, value in loss_dict.items(): + self.train_loss_metric[key].update(value, batch.batch_size()) + # Log the metric instead of the mean loss directly self.log( - f"loss/{key}", - 
torch.mean(loss), - on_step=False, - prog_bar=True, - on_epoch=True, - sync_dist=True, - batch_size=1, # batch.batch_size(), + "loss/{key}", self.train_loss_metric[key], on_epoch=True, prog_bar=True ) - loss = torch.mean(loss_dict["total_loss"]) - - return loss + return torch.mean(loss_dict["total_loss"]) @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: @@ -692,7 +709,6 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: predict_target = self.calculate_predictions(batch, self.potential) self._update_metrics(self.val_error, predict_target) - @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: """ @@ -798,10 +814,11 @@ def _log_on_epoch(self, log_mode: str = "train"): metric.reset() # log dict, print val metrics to console self.log_dict( - metrics, on_epoch=True, prog_bar=(phase == "val"), sync_dist=True + metrics, + on_epoch=True, + prog_bar=(phase == "val"), ) - def configure_optimizers(self): """ Configures the model's optimizers (and optionally schedulers). From 0d61ccbb35372c4d2e20a8d0559540628a41ef2e Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 11:42:17 +0200 Subject: [PATCH 19/66] make logging consistent and log also loss with torchmetric (necessary for synchronized training) --- modelforge/train/training.py | 87 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 8b9f2549..307346a8 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -2,26 +2,32 @@ This module contains classes and functions for training neural network potentials using PyTorch Lightning. """ -from torch.optim.lr_scheduler import ReduceLROnPlateau -from typing import Any, Union, Dict, Type, Optional, List +from abc import ABC, abstractmethod +from typing import Any, Dict, List, Optional, Type, Union + +import lightning.pytorch as pL import torch -from loguru import logger as log -from modelforge.dataset.dataset import BatchData, NNPInput import torchmetrics +from lightning import Trainer +from loguru import logger as log from torch import nn -from abc import ABC, abstractmethod -from modelforge.dataset.dataset import DatasetParameters +from torch.optim.lr_scheduler import ReduceLROnPlateau + +from modelforge.dataset.dataset import ( + BatchData, + DataModule, + DatasetParameters, + NNPInput, +) from modelforge.potential.parameters import ( ANI2xParameters, - PhysNetParameters, - SchNetParameters, PaiNNParameters, + PhysNetParameters, SAKEParameters, + SchNetParameters, TensorNetParameters, ) -from lightning import Trainer -import lightning.pytorch as pL -from modelforge.dataset.dataset import DataModule +from modelforge.train.parameters import RuntimeParameters, TrainingParameters __all__ = [ "Error", @@ -306,7 +312,7 @@ def forward( # add total loss loss = loss + (self.weight[prop] * loss_) # save loss - loss_dict[f"{prop}/mse"] = loss_ + loss_dict[f"{prop}"] = loss_ # add total loss to results dict and return loss_dict["total_loss"] = loss @@ -339,8 +345,8 @@ def create_loss(loss_property: List[str], weight: Dict[str, float]) -> Type[Loss return Loss(loss_property, weight) -from torch.optim import Optimizer from torch.nn import ModuleDict +from torch.optim import Optimizer def create_error_metrics(loss_properties: List[str], loss: bool = False) -> ModuleDict: @@ -351,22 +357,25 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu 
---------- loss_properties : List[str] List of loss properties for which to create the metrics. - + loss : bool, optional + If True, only the loss metric is created, by default False. Returns ------- ModuleDict A dictionary where keys are loss properties and values are MetricCollections. """ - from torchmetrics.regression import ( - MeanAbsoluteError, - MeanSquaredError, - ) from torchmetrics import MetricCollection + from torchmetrics.regression import MeanAbsoluteError, MeanSquaredError + from torchmetrics.aggregation import MeanMetric if loss: - return ModuleDict({prop: MeanLossMetric() for prop in loss_properties}) + metric_dict = ModuleDict( + {prop: MetricCollection([MeanMetric()]) for prop in loss_properties} + ) + metric_dict["total_loss"] = MetricCollection([MeanMetric()]) + return metric_dict else: - return ModuleDict( + metric_dict = ModuleDict( { prop: MetricCollection( [MeanAbsoluteError(), MeanSquaredError(squared=False)] @@ -374,12 +383,13 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu for prop in loss_properties } ) + return metric_dict -import torchmetrics +from torchmetrics import Metric -class MeanLossMetric(torchmetrics.Metric): +class MeanLossMetric(Metric): def __init__(self): super().__init__() self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") @@ -395,9 +405,6 @@ def compute(self): return self.sum_loss / self.total_batches -from modelforge.train.parameters import RuntimeParameters, TrainingParameters - - class CalculateProperties(torch.nn.Module): def __init__(self, requested_properties: List[str]): @@ -597,8 +604,9 @@ def __init__( self.train_error = create_error_metrics( training_parameter.loss_parameter.loss_property ) - # Initialize your custom metric - self.train_loss_metric = create_error_metrics( + + # Initialize loss metric + self.loss_metric = create_error_metrics( training_parameter.loss_parameter.loss_property, loss=True ) @@ -676,13 +684,9 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # Calculate the loss loss_dict = self.loss(predict_target, batch) - # Update the custom metric with the different loss components - for key, value in loss_dict.items(): - self.train_loss_metric[key].update(value, batch.batch_size()) - # Log the metric instead of the mean loss directly - self.log( - "loss/{key}", self.train_loss_metric[key], on_epoch=True, prog_bar=True - ) + # Update the loss metric with the different loss components + for key, metric in loss_dict.items(): + self.loss_metric[key].update(metric, batch.batch_size()) return torch.mean(loss_dict["total_loss"]) @@ -793,6 +797,7 @@ def _log_on_epoch(self, log_mode: str = "train"): errors = [ ("train", self.train_error), ("val", self.val_error), + ("loss", self.loss_metric), ] elif log_mode == "test": errors = [ @@ -809,7 +814,7 @@ def _log_on_epoch(self, log_mode: str = "train"): metrics = {} for property, metrics_dict in error_dict.items(): for name, metric in metrics_dict.items(): - name = f"{phase}/{property}/{conv[name]}" + name = f"{phase}/{property}/{conv.get(name, name)}" metrics[name] = metric.compute() metric.reset() # log dict, print val metrics to console @@ -974,8 +979,8 @@ def setup_datamodule(self) -> DataModule: DataModule Configured DataModule instance. 
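Aside, a self-contained illustration of the metric-based loss logging introduced above: pushing the per-step loss through a torchmetrics MeanMetric, weighted by the batch size, yields an epoch average whose internal states are reduced across DDP ranks before compute(), so every process logs the same number. The step values below are made up.

import torch
from torchmetrics.aggregation import MeanMetric

loss_metric = MeanMetric()

# three simulated training steps with different batch sizes
for step_loss, batch_size in [(0.50, 16), (0.30, 16), (0.10, 8)]:
    loss_metric.update(torch.tensor(step_loss), weight=batch_size)

print(loss_metric.compute())  # weighted mean over all molecules, here 0.34
loss_metric.reset()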
""" - from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES from modelforge.dataset.dataset import DataModule + from modelforge.dataset.utils import REGISTERED_SPLITTING_STRATEGIES dm = DataModule( name=self.dataset_parameter.dataset_name, @@ -1074,8 +1079,8 @@ def setup_callbacks(self) -> List[Any]: List of configured callbacks. """ from lightning.pytorch.callbacks import ( - ModelCheckpoint, EarlyStopping, + ModelCheckpoint, StochasticWeightAveraging, ) @@ -1309,9 +1314,9 @@ def read_config( runtime_config_dict[key] = value # Load and instantiate the data classes with the merged configuration - from modelforge.potential import _Implemented_NNP_Parameters from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters + from modelforge.potential import _Implemented_NNP_Parameters + from modelforge.train.parameters import RuntimeParameters, TrainingParameters potential_name = potential_config_dict["potential_name"] PotentialParameters = ( @@ -1409,9 +1414,7 @@ def read_config_and_train( log_every_n_steps=log_every_n_steps, simulation_environment=simulation_environment, ) - from modelforge.potential.models import ( - NeuralNetworkPotentialFactory, - ) + from modelforge.potential.models import NeuralNetworkPotentialFactory model = NeuralNetworkPotentialFactory.generate_potential( use="training", From a62029c7ca3e40aa2c65420fe2d87a9c0cdf2f84 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 11:48:50 +0200 Subject: [PATCH 20/66] skip if ANI and SPICE --- modelforge/tests/test_training.py | 4 ++++ modelforge/train/training.py | 1 + 2 files changed, 5 insertions(+) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index b50f6454..f39fb79c 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -85,7 +85,11 @@ def test_train_with_lightning(potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
""" + # train potential + # SKIP if potential is ANI and dataset is SPICE2 + if potential_name == "ANI" and dataset_name == "SPICE2": + pytest.skip("ANI potential is not compatible with SPICE2 dataset") get_trainer(potential_name, dataset_name).train_potential().save_checkpoint( "test.chp" ) # save checkpoint diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 307346a8..7d44b95f 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -790,6 +790,7 @@ def _log_on_epoch(self, log_mode: str = "train"): conv = { "MeanAbsoluteError": "mae", "MeanSquaredError": "rmse", + "MeanMetric": "mse", # NOTE: MeanMetric is the MSE since we accumulate the squared error } # NOTE: MeanSquaredError(squared=False) is RMSE # Log all accumulated metrics for train and val phases From ac9e853afd1111e4d4781af814599e0dda70e485 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 13:19:05 +0200 Subject: [PATCH 21/66] include force training test --- .gitignore | 4 ++ .../training_defaults/default_with_force.toml | 54 +++++++++++++++++++ modelforge/tests/test_training.py | 25 +++++---- modelforge/train/training.py | 4 +- scripts/config.toml | 12 ++--- 5 files changed, 82 insertions(+), 17 deletions(-) create mode 100644 modelforge/tests/data/training_defaults/default_with_force.toml diff --git a/.gitignore b/.gitignore index 2ac2cee4..3c452056 100644 --- a/.gitignore +++ b/.gitignore @@ -190,3 +190,7 @@ lightning_logs/ *.hdf5 */tb_logs/* .vscode/settings.json +logs/* +cache/* +*/logs/* +*/cache/* diff --git a/modelforge/tests/data/training_defaults/default_with_force.toml b/modelforge/tests/data/training_defaults/default_with_force.toml new file mode 100644 index 00000000..19c0d3b7 --- /dev/null +++ b/modelforge/tests/data/training_defaults/default_with_force.toml @@ -0,0 +1,54 @@ +[training] +number_of_epochs = 2 +remove_self_energies = true +batch_size = 128 +lr = 1e-3 +monitor = "val/per_molecule_energy/rmse" + + +[training.experiment_logger] +logger_name = "tensorboard" # this will set which logger to use + +# configuration for both loggers can be defined simultaneously, the logger_name variable defines which logger to use +[training.experiment_logger.tensorboard_configuration] +save_dir = "logs" + +[training.experiment_logger.wandb_configuration] +save_dir = "logs" +project = "training_potentials" +group = "exp00" +log_model = true +job_type = "testing" +tags = ["v_0.1.0"] +notes = "testing training" + +[training.lr_scheduler] +frequency = 1 +mode = "min" +factor = 0.1 +patience = 10 +cooldown = 5 +min_lr = 1e-8 +threshold = 0.1 +threshold_mode = "abs" +monitor = "val/per_molecule_energy/rmse" +interval = "epoch" + +[training.loss_parameter] +loss_property = ['per_molecule_energy', 'per_atom_force'] # use + +[training.loss_parameter.weight] +per_molecule_energy = 0.999 #NOTE: reciprocal units +per_atom_force = 0.001 + + +[training.early_stopping] +verbose = true +monitor = "val/per_molecule_energy/rmse" +min_delta = 0.001 +patience = 50 + +[training.splitting_strategy] +name = "random_record_splitting_strategy" +data_split = [0.8, 0.1, 0.1] +seed = 42 diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index f39fb79c..bec6e422 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -10,7 +10,9 @@ from modelforge.potential import NeuralNetworkPotentialFactory, _Implemented_NNPs -def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): +def 
load_configs_into_pydantic_models( + potential_name: str, dataset_name: str, training_toml: str +): from importlib import resources import toml @@ -26,7 +28,7 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): resources.files(potential_defaults) / f"{potential_name.lower()}.toml" ) dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" - training_path = resources.files(training_defaults) / "default.toml" + training_path = resources.files(training_defaults) / f"{training_toml}.toml" runtime_path = resources.files(runtime_defaults) / "runtime.toml" training_config_dict = toml.load(training_path) @@ -58,8 +60,10 @@ def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): } -def get_trainer(potential_name: str, dataset_name: str): - config = load_configs_into_pydantic_models(potential_name, dataset_name) +def get_trainer(potential_name: str, dataset_name: str, training_toml: str): + config = load_configs_into_pydantic_models( + potential_name, dataset_name, training_toml + ) # Extract parameters potential_parameter = config["potential"] @@ -81,21 +85,24 @@ def get_trainer(potential_name: str, dataset_name: str): "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) @pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) -def test_train_with_lightning(potential_name, dataset_name): +@pytest.mark.parametrize("training", ["with_force", "without_force"]) +def test_train_with_lightning(training, potential_name, dataset_name): """ Test that we can train, save and load checkpoints. """ # train potential - + training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 - if potential_name == "ANI" and dataset_name == "SPICE2": + if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - get_trainer(potential_name, dataset_name).train_potential().save_checkpoint( + get_trainer( + potential_name, dataset_name, training_toml + ).train_potential().save_checkpoint( "test.chp" ) # save checkpoint # continue training from checkpoint - get_trainer(potential_name, dataset_name).train_potential() + get_trainer(potential_name, dataset_name, training_toml).train_potential() def test_train_from_single_toml_file(): diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 7d44b95f..f3ec2c33 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -417,7 +417,7 @@ def __init__(self, requested_properties: List[str]): super().__init__() self.requested_properties = requested_properties self.include_force = False - if "force" in self.requested_properties: + if "per_atom_force" in self.requested_properties: self.include_force = True def _get_forces( @@ -822,7 +822,7 @@ def _log_on_epoch(self, log_mode: str = "train"): self.log_dict( metrics, on_epoch=True, - prog_bar=(phase == "val"), + prog_bar=(phase == "val" or phase == "loss"), ) def configure_optimizers(self): diff --git a/scripts/config.toml b/scripts/config.toml index e0c84a82..037437c6 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -25,7 +25,7 @@ keep_per_atom_property = true calculate_molecular_self_energy = true [dataset] -dataset_name = "QM9" +dataset_name = "PHALKETHOH" version_select = "nc_1000_v0" num_workers = 4 pin_memory = true @@ -56,11 +56,11 @@ monitor = "val/per_molecule_energy/rmse" interval = "epoch" [training.loss_parameter] -loss_property = 
['per_molecule_energy'] # use +loss_property = ['per_molecule_energy', 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 1.0 #NOTE: reciprocal units - +per_molecule_energy = 0.009 #NOTE: reciprocal units +per_atom_force = 0.001 [training.early_stopping] verbose = true @@ -74,12 +74,12 @@ data_split = [0.8, 0.1, 0.1] seed = 42 [runtime] -save_dir = "lightning_logs" +save_dir = "test_setup" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" accelerator = "cpu" number_of_nodes = 1 -devices = 1 #[0,1,2,3] +devices = 1 #[0,1,2,3] checkpoint_path = "None" simulation_environment = "PyTorch" log_every_n_steps = 1 From 6ae7ca051e09f05aa5c9a7bab5fd4ba6869fa6a9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 17:40:02 +0200 Subject: [PATCH 22/66] still issues with mutliple GPUs --- modelforge/train/training.py | 9 +-------- scripts/config.toml | 24 +++++++++++------------- 2 files changed, 12 insertions(+), 21 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index f3ec2c33..1ab1d3c4 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -812,18 +812,11 @@ def _log_on_epoch(self, log_mode: str = "train"): if phase == "train" and not self.log_on_training_step: continue - metrics = {} for property, metrics_dict in error_dict.items(): for name, metric in metrics_dict.items(): name = f"{phase}/{property}/{conv.get(name, name)}" - metrics[name] = metric.compute() + self.log(name, metric.compute(), prog_bar=True) metric.reset() - # log dict, print val metrics to console - self.log_dict( - metrics, - on_epoch=True, - prog_bar=(phase == "val" or phase == "loss"), - ) def configure_optimizers(self): """ diff --git a/scripts/config.toml b/scripts/config.toml index 037437c6..d6e92a10 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -1,28 +1,26 @@ [potential] -potential_name = "SchNet" +potential_name = "ANI2x" [potential.core_parameter] -number_of_radial_basis_functions = 32 -maximum_interaction_radius = "5.0 angstrom" -number_of_interaction_modules = 5 -number_of_filters = 64 -shared_interactions = true +angle_sections = 4 +maximum_interaction_radius = "5.1 angstrom" +minimum_interaction_radius = "0.8 angstrom" +number_of_radial_basis_functions = 16 +maximum_interaction_radius_for_angular_features = "3.5 angstrom" +minimum_interaction_radius_for_angular_features = "0.8 angstrom" +angular_dist_divisions = 8 [potential.core_parameter.activation_function_parameter] -activation_function_name = "ShiftedSoftplus" +activation_function_name = "CeLU" # for the original ANI behavior please stick with CeLu since the alpha parameter is currently hard coded and might lead to different behavior when another activation function is used. 
-[potential.core_parameter.featurization] -properties_to_featurize = ['atomic_number'] -maximum_atomic_number = 101 -number_of_per_atom_features = 128 +[potential.core_parameter.activation_function_parameter.activation_function_arguments] +alpha = 0.1 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] normalize = true from_atom_to_molecule_reduction = true keep_per_atom_property = true -[potential.postprocessing_parameter.general_postprocessing_operation] -calculate_molecular_self_energy = true [dataset] dataset_name = "PHALKETHOH" From ebba646b2e8696cec43c84bb248d96ec2b61d01b Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 19:10:59 +0200 Subject: [PATCH 23/66] make loss tensor's stride contiguous --- modelforge/tests/test_training.py | 4 +++- modelforge/train/training.py | 16 +++++++++++++++- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index bec6e422..545b31c0 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -93,7 +93,7 @@ def test_train_with_lightning(training, potential_name, dataset_name): # train potential training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 - if "ANI" in potential_name and dataset_name == "SPICE2": + if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") get_trainer( potential_name, dataset_name, training_toml @@ -104,6 +104,8 @@ def test_train_with_lightning(training, potential_name, dataset_name): # continue training from checkpoint get_trainer(potential_name, dataset_name, training_toml).train_potential() + assert False + def test_train_from_single_toml_file(): from importlib import resources diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 1ab1d3c4..a9dc3bf3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -574,6 +574,19 @@ def __init__( potential_seed=potential_seed, ) + # def check_strides(module, grad_input, grad_output): + # print(f"Layer: {module.__class__.__name__}") + + # for i, grad in enumerate(grad_input): + # if grad is not None: + # print( + # f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" + # ) + + # # Register the hook + # for module in self.potential.modules(): + # module.register_backward_hook(check_strides) + self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property ) @@ -688,7 +701,8 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: for key, metric in loss_dict.items(): self.loss_metric[key].update(metric, batch.batch_size()) - return torch.mean(loss_dict["total_loss"]) + loss = torch.mean(loss_dict["total_loss"]).contiguous() + return loss @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: From a315724f28d9a642f3a938af00ca18e28f3303ed Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 23 Aug 2024 19:16:50 +0200 Subject: [PATCH 24/66] sync log --- modelforge/train/training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index a9dc3bf3..e88a1768 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -829,7 +829,7 @@ def _log_on_epoch(self, log_mode: str = "train"): for property, metrics_dict in error_dict.items(): for name, metric in 
metrics_dict.items(): name = f"{phase}/{property}/{conv.get(name, name)}" - self.log(name, metric.compute(), prog_bar=True) + self.log(name, metric.compute(), prog_bar=True, sync_dist=True) metric.reset() def configure_optimizers(self): From c5b87be391bdbbfaac31ceeea37078b03af04fab Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 08:14:06 +0200 Subject: [PATCH 25/66] stride is an issue in the backward pass through the forces, this m might fix it --- modelforge/train/training.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index e88a1768..e2d814f3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -463,7 +463,7 @@ def _get_forces( return { "per_atom_force_true": per_atom_force_true, - "per_atom_force_predict": per_atom_force_predict, + "per_atom_force_predict": per_atom_force_predict.contiguous(), } def _get_energies( @@ -1128,6 +1128,10 @@ def setup_trainer(self) -> Trainer: """ from lightning import Trainer + # if devices is a list + if isinstance(self.runtime_parameter.devices, list): + strategy = "ddp" + trainer = Trainer( max_epochs=self.training_parameter.number_of_epochs, num_nodes=self.runtime_parameter.number_of_nodes, From 8da97be4ac322ee91270425120e3baa8be0a11c3 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 08:45:34 +0200 Subject: [PATCH 26/66] still stride issue --- modelforge/tests/test_training.py | 6 +++++- modelforge/train/training.py | 22 +++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 545b31c0..891d9169 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,6 +80,8 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): ) + + @pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() @@ -90,11 +92,13 @@ def test_train_with_lightning(training, potential_name, dataset_name): """ Test that we can train, save and load checkpoints. 
""" - # train potential + # get correct training toml training_toml = "default_with_force" if training == "with_force" else "default" # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") + + # train potential get_trainer( potential_name, dataset_name, training_toml ).train_potential().save_checkpoint( diff --git a/modelforge/train/training.py b/modelforge/train/training.py index e2d814f3..b79d7b84 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -169,7 +169,7 @@ def forward( 0, batch.nnp_input.atomic_subsystem_indices.long().unsqueeze(1), per_atom_squared_error, - ) + ).contiguous() # divide by number of atoms per_molecule_square_error_scaled = self.scale_by_number_of_atoms( per_molecule_squared_error, @@ -574,18 +574,18 @@ def __init__( potential_seed=potential_seed, ) - # def check_strides(module, grad_input, grad_output): - # print(f"Layer: {module.__class__.__name__}") + def check_strides(module, grad_input, grad_output): + print(f"Layer: {module.__class__.__name__}") - # for i, grad in enumerate(grad_input): - # if grad is not None: - # print( - # f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" - # ) + for i, grad in enumerate(grad_input): + if grad is not None: + print( + f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" + ) - # # Register the hook - # for module in self.potential.modules(): - # module.register_backward_hook(check_strides) + # Register the hook + for module in self.potential.modules(): + module.register_backward_hook(check_strides) self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property From e4ce8d66eda970b472a059996f6370782da6eee4 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 14:59:32 +0200 Subject: [PATCH 27/66] add stride hook for backward --- modelforge/train/training.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index b79d7b84..0a870f7d 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -175,7 +175,7 @@ def forward( per_molecule_squared_error, batch.metadata.atomic_subsystem_counts, prefactor=per_atom_prediction.shape[-1], - ) + ).contiguous() return per_molecule_square_error_scaled @@ -542,6 +542,7 @@ def __init__( dataset_statistic: Dict[str, float], training_parameter: TrainingParameters, potential_seed: Optional[int] = None, + debugging: bool = True, ): """ Initializes the TrainingAdapter with the specified model and training configuration. 
@@ -576,16 +577,21 @@ def __init__( def check_strides(module, grad_input, grad_output): print(f"Layer: {module.__class__.__name__}") - for i, grad in enumerate(grad_input): if grad is not None: print( f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" ) + for i, grad in enumerate(grad_output): + if grad is not None: + print( + f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" + ) - # Register the hook - for module in self.potential.modules(): - module.register_backward_hook(check_strides) + # Register the full backward hook + if debugging is True: + for module in self.potential.modules(): + module.register_full_backward_hook(check_strides) self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property From 3f087ab2e64a5410e376669156a842a708a838f6 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 24 Aug 2024 15:25:14 +0200 Subject: [PATCH 28/66] dicst as module output for stride --- modelforge/train/training.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 0a870f7d..caeaac59 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -582,11 +582,22 @@ def check_strides(module, grad_input, grad_output): print( f"Grad input {i}: size {grad.size()}, strides {grad.stride()}" ) - for i, grad in enumerate(grad_output): - if grad is not None: - print( - f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" - ) + # Handle grad_output + if isinstance(grad_output, tuple) and isinstance(grad_output[0], dict): + # If the output is a dict wrapped in a tuple, extract the dict + grad_output = grad_output[0] + if isinstance(grad_output, dict): + for key, grad in grad_output.items(): + if grad is not None: + print( + f"Grad output [{key}]: size {grad.size()}, strides {grad.stride()}" + ) + else: + for i, grad in enumerate(grad_output): + if grad is not None: + print( + f"Grad output {i}: size {grad.size()}, strides {grad.stride()}" + ) # Register the full backward hook if debugging is True: From 507209868a14c7b65de5084d96ecf22b5d21d29f Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 19:26:26 +0200 Subject: [PATCH 29/66] avoid saving grad in val/test routine --- modelforge/tests/test_training.py | 19 ++++++++----- modelforge/train/training.py | 45 +++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 21 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 891d9169..4acd629b 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -80,8 +80,6 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): ) - - @pytest.mark.skipif(ON_MACOS, reason="Skipping this test on MacOS GitHub Actions") @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() @@ -108,8 +106,6 @@ def test_train_with_lightning(training, potential_name, dataset_name): # continue training from checkpoint get_trainer(potential_name, dataset_name, training_toml).train_potential() - assert False - def test_train_from_single_toml_file(): from importlib import resources @@ -188,8 +184,17 @@ def test_loss(single_batch_with_batchsize): assert loss is not None # get trainer - trainer = get_trainer("schnet", "QM9") - prediction = trainer.model.calculate_predictions(batch, trainer.model.potential) + trainer = get_trainer("schnet", "QM9", "default_with_force") + prediction = 
trainer.model.calculate_predictions( + batch, trainer.model.potential, train_mode=True + ) # train_mode=True is required for gradients in force prediction + + assert prediction["per_molecule_energy_predict"].size( + dim=0 + ) == batch.metadata.E.size(dim=0) + assert prediction["per_molecule_force_predict"].size( + dim=0 + ) == batch.metadata.E.size(dim=0) # pass prediction through loss module loss_output = loss(prediction, batch) @@ -209,7 +214,7 @@ def test_loss(single_batch_with_batchsize): ).pow(2) ) ) - assert torch.allclose(loss_output["per_molecule_energy/mse"], E_loss) + assert torch.allclose(loss_output["per_molecule_energy"], E_loss) # --------------------------------------------- # # now calculate F_loss diff --git a/modelforge/train/training.py b/modelforge/train/training.py index caeaac59..7243e819 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -421,7 +421,7 @@ def __init__(self, requested_properties: List[str]): self.include_force = True def _get_forces( - self, batch: "BatchData", energies: Dict[str, torch.Tensor] + self, batch: "BatchData", energies: Dict[str, torch.Tensor], train_mode: bool ) -> Dict[str, torch.Tensor]: """ Computes the forces from a given batch using the model. @@ -454,11 +454,17 @@ def _get_forces( # Compute the gradient (forces) from the predicted energies grad = torch.autograd.grad( - per_molecule_energy_predict.sum(), + per_molecule_energy_predict, nnp_input.positions, - create_graph=True, - retain_graph=True, + grad_outputs=torch.ones_like(per_molecule_energy_predict), + create_graph=train_mode, + retain_graph=train_mode, + allow_unused=True, )[0] + + if grad is None: + raise RuntimeWarning("Force calculation did not return a gradient") + per_atom_force_predict = -1 * grad # Forces are the negative gradient of energy return { @@ -501,7 +507,7 @@ def _get_energies( } def forward( - self, batch: "BatchData", model: Type[torch.nn.Module] + self, batch: "BatchData", model: Type[torch.nn.Module], train_mode: bool = False ) -> Dict[str, torch.Tensor]: """ Computes the energies and forces from a given batch using the model. @@ -520,7 +526,7 @@ def forward( """ energies = self._get_energies(batch, model) if self.include_force: - forces = self._get_forces(batch, energies) + forces = self._get_forces(batch, energies, train_mode) else: forces = {} return {**energies, **forces} @@ -542,7 +548,6 @@ def __init__( dataset_statistic: Dict[str, float], training_parameter: TrainingParameters, potential_seed: Optional[int] = None, - debugging: bool = True, ): """ Initializes the TrainingAdapter with the specified model and training configuration. 
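The force path above comes down to differentiating a scalar energy with respect to the positions. A stripped-down sketch with a toy quadratic energy, where create_graph would only be needed when the forces feed into a loss that is itself backpropagated:

import torch

positions = torch.randn(5, 3, requires_grad=True)
energy = 0.5 * (positions**2).sum()  # toy harmonic energy surface

grad = torch.autograd.grad(
    energy,
    positions,
    grad_outputs=torch.ones_like(energy),
    create_graph=False,  # True during training when a force loss is backpropagated
)[0]
per_atom_force = -grad  # forces are the negative gradient of the energy
print(per_atom_force.shape)  # torch.Size([5, 3])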
@@ -600,10 +605,14 @@ def check_strides(module, grad_input, grad_output): ) # Register the full backward hook - if debugging is True: + if training_parameter.verbose is True: for module in self.potential.modules(): module.register_full_backward_hook(check_strides) + self.include_force = False + if "per_atom_force" in training_parameter.loss_parameter.loss_property: + self.include_force = True + self.calculate_predictions = CalculateProperties( training_parameter.loss_parameter.loss_property ) @@ -709,7 +718,9 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # calculate energy and forces, Note that `predict_target` is a # dictionary containing the predicted and true values for energy and # force` - predict_target = self.calculate_predictions(batch, self.potential) + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) # Calculate the loss loss_dict = self.loss(predict_target, batch) @@ -721,7 +732,6 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: loss = torch.mean(loss_dict["total_loss"]).contiguous() return loss - @torch.enable_grad() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: """ Validation step to compute the RMSE/MAE across epochs. @@ -740,11 +750,15 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) - # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + with torch.inference_mode(False): + + # calculate energy and forces + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) + self._update_metrics(self.val_error, predict_target) - @torch.enable_grad() def test_step(self, batch: "BatchData", batch_idx: int) -> None: """ Test step to compute the RMSE loss for a given batch. @@ -767,7 +781,10 @@ def test_step(self, batch: "BatchData", batch_idx: int) -> None: # Ensure positions require gradients for force calculation batch.nnp_input.positions.requires_grad_(True) # calculate energy and forces - predict_target = self.calculate_predictions(batch, self.potential) + with torch.inference_mode(False): + predict_target = self.calculate_predictions( + batch, self.potential, self.training + ) # Update and log metrics self._update_metrics(self.test_error, predict_target) From 5895acad99958093cdc8af657899e68e6b17f6c9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 19:35:10 +0200 Subject: [PATCH 30/66] only linting changes --- modelforge/dataset/dataset.py | 4 ++-- modelforge/tests/test_dataset.py | 4 ++-- modelforge/tests/test_models.py | 3 +-- modelforge/train/training.py | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 440c867b..8c78ab32 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -237,10 +237,10 @@ def to( self.metadata = self.metadata.to(device=device, dtype=dtype) return self - def batch_size(self): return self.metadata.E.size(dim=0) - + + class TorchDataset(torch.utils.data.Dataset[BatchData]): """ Wraps a numpy dataset to make it compatible with PyTorch DataLoader. 
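The inference-mode handling in the validation and test steps above follows a general PyTorch pattern: evaluation loops run under torch.inference_mode, so gradient-based quantities such as forces have to re-enable autograd locally. A minimal sketch of that pattern, detached from modelforge:

import torch

with torch.inference_mode():           # what the evaluation loop runs under
    with torch.inference_mode(False):  # locally re-enable autograd
        positions = torch.randn(4, 3).requires_grad_(True)
        energy = (positions**2).sum()
        forces = -torch.autograd.grad(energy, positions)[0]
print(forces.shape)  # torch.Size([4, 3])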
diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index b591fa30..91f963a3 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -461,7 +461,7 @@ def test_data_item_format_of_datamodule( def test_dataset_neighborlist(potential_name, single_batch_with_batchsize): """Test the neighborlist.""" - batch = single_batch_with_batchsize(64, 'QM9') + batch = single_batch_with_batchsize(64, "QM9") nnp_input = batch.nnp_input # test that the neighborlist is correctly generated @@ -714,7 +714,7 @@ def test_numpy_dataset_assignment(dataset_name): def test_energy_postprocessing(): - # test that the mean and stddev of the dataset + # test that the mean and stddev of the dataset # are correct from modelforge.dataset.dataset import DataModule diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 58602e31..e536b1b0 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -450,7 +450,6 @@ def test_forward_pass( output = model(nnp_input) - # test that we get an energie per molecule assert len(output["per_molecule_energy"]) == nr_of_mols @@ -459,7 +458,7 @@ def test_forward_pass( # This has to be reflected in the atomic energies E_i, which # have to be equal for all hydrogens if "JAX" not in str(type(model)) and dataset_name == "QM9": - # make sure that we are correctly reducing + # make sure that we are correctly reducing ref = torch.zeros_like(output["per_molecule_energy"]).scatter_add_( 0, nnp_input.atomic_subsystem_indices.long(), output["per_atom_energy"] ) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 7243e819..076a121c 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -756,7 +756,7 @@ def validation_step(self, batch: "BatchData", batch_idx: int) -> None: predict_target = self.calculate_predictions( batch, self.potential, self.training ) - + self._update_metrics(self.val_error, predict_target) def test_step(self, batch: "BatchData", batch_idx: int) -> None: From 701122d2793f2cdab40f9191ce8c82fc360dd21b Mon Sep 17 00:00:00 2001 From: wiederm Date: Sun, 25 Aug 2024 22:59:21 +0200 Subject: [PATCH 31/66] fix loss test --- modelforge/tests/test_training.py | 15 +++++++++------ modelforge/train/training.py | 3 ++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 4acd629b..38c2ba24 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -192,9 +192,9 @@ def test_loss(single_batch_with_batchsize): assert prediction["per_molecule_energy_predict"].size( dim=0 ) == batch.metadata.E.size(dim=0) - assert prediction["per_molecule_force_predict"].size( + assert prediction["per_atom_force_predict"].size(dim=0) == batch.metadata.F.size( dim=0 - ) == batch.metadata.E.size(dim=0) + ) # pass prediction through loss module loss_output = loss(prediction, batch) @@ -212,9 +212,12 @@ def test_loss(single_batch_with_batchsize): prediction["per_molecule_energy_predict"] - prediction["per_molecule_energy_true"] ).pow(2) + / batch.metadata.atomic_subsystem_counts.unsqueeze(1) ) ) - assert torch.allclose(loss_output["per_molecule_energy"], E_loss) + # compare to referenc evalue obtained from Loos class + ref = torch.mean(loss_output["per_molecule_energy"]) + assert torch.allclose(ref, E_loss) # --------------------------------------------- # # now calculate F_loss @@ -239,15 +242,15 @@ def 
test_loss(single_batch_with_batchsize): ) per_atom_force_mse = torch.mean(per_molecule_squared_error) - assert torch.allclose(loss_output["per_atom_force/mse"], per_atom_force_mse) + assert torch.allclose(torch.mean(loss_output["per_atom_force"]), per_atom_force_mse) # --------------------------------------------- # # let's double check that the loss is calculated correctly # calculate the total loss assert torch.allclose( - loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy/mse"] - + loss_weights["per_atom_force"] * loss_output["per_atom_force/mse"], + loss_weights["per_molecule_energy"] * loss_output["per_molecule_energy"] + + loss_weights["per_atom_force"] * loss_output["per_atom_force"], loss_output["total_loss"].to(torch.float32), ) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 076a121c..c9d36ec9 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -83,7 +83,8 @@ def calculate_squared_error( torch.Tensor The calculated error. """ - error = (predicted_tensor - reference_tensor).pow(2).sum(dim=1, keepdim=True) + squared_diff = (predicted_tensor - reference_tensor).pow(2) + error = squared_diff.sum(dim=1, keepdim=True) return error @staticmethod From 42dc20a9da92786ddc576d459d0794dd30a8a20c Mon Sep 17 00:00:00 2001 From: wiederm Date: Mon, 26 Aug 2024 07:04:22 +0200 Subject: [PATCH 32/66] fix tests --- modelforge/tests/test_parameter_models.py | 7 +++++-- modelforge/tests/test_sake.py | 2 +- modelforge/tests/test_training.py | 4 ++-- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/modelforge/tests/test_parameter_models.py b/modelforge/tests/test_parameter_models.py index ce97537d..2c26e2cd 100644 --- a/modelforge/tests/test_parameter_models.py +++ b/modelforge/tests/test_parameter_models.py @@ -142,6 +142,9 @@ def test_training_parameter_model(): with pytest.raises(ValidationError): training_parameters.splitting_strategy.dataset_split = [0.7, 0.1, 0.1, 0.1] - # this will throw an error because the datafile has 2 entries for the loss_property dictionary + # this will throw an error because the datafile has 1 entries for the loss_property dictionary with pytest.raises(ValidationError): - training_parameters.loss_parameter.loss_property = ["per_molecule_energy"] + training_parameters.loss_parameter.loss_property = [ + "per_molecule_energy", + "per_atom_force", + ] diff --git a/modelforge/tests/test_sake.py b/modelforge/tests/test_sake.py index b3f36ee1..994b7112 100644 --- a/modelforge/tests/test_sake.py +++ b/modelforge/tests/test_sake.py @@ -624,7 +624,7 @@ def test_model_invariance(single_batch_with_batchsize): ], ) # get methane input - batch = single_batch_with_batchsize(batch_size=1) + batch = single_batch_with_batchsize(batch_size=1, dataset_name="QM9") methane = batch.nnp_input rotation_matrix = torch.tensor([[0.0, 1.0, 0.0], [-1.0, 0.0, 0.0], [0.0, 0.0, 1.0]]) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 38c2ba24..a442f980 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -147,7 +147,7 @@ def test_error_calculation(single_batch_with_batchsize): 1 ) # FIXME : fi reference_E_error = torch.mean(scale_squared_error) - assert torch.allclose(E_error, reference_E_error) + assert torch.allclose(torch.mean(E_error), reference_E_error) # test error for property with shape (nr_of_atoms, 3) error = FromPerAtomToPerMoleculeSquaredError() @@ -170,7 +170,7 @@ def test_error_calculation(single_batch_with_batchsize): 
reference_F_error = torch.mean( per_mol_error / (3 * data.metadata.atomic_subsystem_counts.unsqueeze(1)) ) - assert torch.allclose(F_error, reference_F_error) + assert torch.allclose(torch.mean(F_error), reference_F_error) def test_loss(single_batch_with_batchsize): From d93c82f701862028eadd2d7a9e9ed9d77c9f2742 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Mon, 26 Aug 2024 21:51:56 -0700 Subject: [PATCH 33/66] Addressing comments --- modelforge/potential/ani.py | 8 -------- modelforge/potential/models.py | 6 ------ 2 files changed, 14 deletions(-) diff --git a/modelforge/potential/ani.py b/modelforge/potential/ani.py index 32da49ae..67019c77 100644 --- a/modelforge/potential/ani.py +++ b/modelforge/potential/ani.py @@ -16,11 +16,6 @@ from modelforge.utils.prop import SpeciesAEV -# if TYPE_CHECKING: -# from modelforge.dataset.dataset import NNPInput -# from .models import PairListOutputs - - def triu_index(num_species: int) -> torch.Tensor: """ Generate a tensor representing the upper triangular indices for species pairs. @@ -110,7 +105,6 @@ def __init__( angle_sections: int, nr_of_supported_elements: int = 7, ): - super().__init__() from modelforge.potential.utils import CosineAttenuationFunction @@ -386,7 +380,6 @@ class ANIInteraction(nn.Module): """ def __init__(self, *, aev_dim: int, activation_function: Type[torch.nn.Module]): - super().__init__() # define atomic neural network atomic_neural_networks = self.intialize_atomic_neural_network( @@ -677,7 +670,6 @@ def __init__( dataset_statistic: Optional[Dict[str, float]] = None, potential_seed: Optional[int] = None, ) -> None: - from modelforge.utils.units import _convert_str_to_unit self.only_unique_pairs = True # NOTE: need to be set before super().__init__ diff --git a/modelforge/potential/models.py b/modelforge/potential/models.py index 27dcd4f2..dd6a5ae1 100644 --- a/modelforge/potential/models.py +++ b/modelforge/potential/models.py @@ -385,12 +385,6 @@ def forward( ) return interacting_outputs - # - # return PairListOutputs( - # pair_indices=pair_indices_within_cutoff, - # d_ij=d_ij[in_cutoff], - # r_ij=r_ij[in_cutoff], - # ) from typing import Callable, Literal, Optional, Union From 83bd33402dca39b40bf145960267bd3c1e056181 Mon Sep 17 00:00:00 2001 From: Marcus Wieder <31651017+wiederm@users.noreply.github.com> Date: Tue, 27 Aug 2024 08:38:05 +0200 Subject: [PATCH 34/66] Update test_env.yaml --- devtools/conda-envs/test_env.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index b045aecf..8e574780 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -8,6 +8,7 @@ dependencies: - pip - h5py - tqdm + - toml - qcportal>=0.50 - qcelemental - pytorch>=2.1 From 2c0b6436756c7995ce7a4c5728060e4c7acd2aa0 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:42:59 +0200 Subject: [PATCH 35/66] decorator for method locking --- modelforge/dataset/dataset.py | 43 +++++++++++++++-------- modelforge/utils/__init__.py | 1 + modelforge/utils/misc.py | 64 +++++++++++++++++++++++++++++++++-- 3 files changed, 91 insertions(+), 17 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 8c78ab32..4ae48bf9 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -4,7 +4,7 @@ import os from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Literal, Optional, Union, NamedTuple +from typing import TYPE_CHECKING, Dict, List, Literal, NamedTuple, 
Optional, Union import numpy as np import pytorch_lightning as pl @@ -19,11 +19,10 @@ if TYPE_CHECKING: from modelforge.potential.processing import AtomicSelfEnergies - -from pydantic import BaseModel, field_validator, ConfigDict, Field - from enum import Enum +from pydantic import BaseModel, ConfigDict, Field, field_validator + class CaseInsensitiveEnum(str, Enum): @classmethod @@ -208,8 +207,9 @@ def as_jax_namedtuple(self) -> NamedTuple: """Export the dataclass fields and values as a named tuple. Convert pytorch tensors to jax arrays.""" - from dataclasses import dataclass, fields import collections + from dataclasses import dataclass, fields + from modelforge.utils.io import import_ convert_to_jax = import_("pytorch2jax").pytorch2jax.convert_to_jax @@ -1042,7 +1042,6 @@ def create_dataset( return TorchDataset(data.numpy_data, data._property_names) -from torch import nn from openff.units import unit @@ -1101,6 +1100,9 @@ def __init__( regenerate_cache : bool, defaults to False Whether to regenerate the cache. """ + from modelforge.potential.models import Pairlist + import os + super().__init__() self.name = name @@ -1117,28 +1119,41 @@ def __init__( self.train_dataset = None self.test_dataset = None self.val_dataset = None - import os # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) self.regenerate_cache = regenerate_cache - from modelforge.potential.models import Pairlist self.pairlist = Pairlist() self.dataset_statistic_filename = ( f"{self.local_cache_dir}/{self.name}_dataset_statistic.toml" ) + self.cache_processed_dataset_filename = f"{self.local_cache_dir}/{self.name}_{self.version_select}processed_dataset.pt" def prepare_data( self, ) -> None: """ - Prepares the dataset for use. This method is responsible for the initial processing of the data such as calculating self energies, atomic energy statistics, and splitting. It is executed only once per node. + Prepares the dataset for use. This method is responsible for the initial + processing of the data such as calculating self energies, atomic energy + statistics, and splitting. It is executed only once per node. """ + # if the dataset has already been processed, skip this step + if ( + os.path.exists(self.cache_processed_dataset_filename) + and not self.regenerate_cache + ): + if not os.path.exists(self.dataset_statistic_filename): + raise FileNotFoundError( + f"Dataset statistics file {self.dataset_statistic_filename} not found. Please regenerate the cache." + ) + log.info('Processed dataset already exists. 
Skipping "prepare_data" step.') + return None + + # if the dataset is not already processed, process it from modelforge.dataset import _ImplementedDatasets - import toml - dataset_class = _ImplementedDatasets.get_dataset_class(self.name) + dataset_class = _ImplementedDatasets.get_dataset_class(str(self.name)) dataset = dataset_class( force_download=self.force_download, version_select=self.version_select, @@ -1284,11 +1299,11 @@ def _calculate_atomic_self_energies( def _cache_dataset(self, torch_dataset): """Cache the dataset and its statistics using PyTorch's serialization.""" - torch.save(torch_dataset, "torch_dataset.pt") - # sleep for 1 second to make sure that the dataset was written to disk + torch.save(torch_dataset, self.cache_processed_dataset_filename) + # sleep for 5 second to make sure that the dataset was written to disk import time - time.sleep(1) + time.sleep(5) def setup(self, stage: Optional[str] = None) -> None: """Sets up datasets for the train, validation, and test stages based on the stage argument.""" diff --git a/modelforge/utils/__init__.py b/modelforge/utils/__init__.py index 4e57d2f1..605aea59 100644 --- a/modelforge/utils/__init__.py +++ b/modelforge/utils/__init__.py @@ -1,3 +1,4 @@ """Module of general modelforge utilities.""" from .prop import SpeciesEnergies, PropertyNames +from .misc import lock_with_attribute diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index f56b2d8d..2a0e7280 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -2,15 +2,18 @@ Module of miscellaneous utilities. """ -from typing import Literal +from typing import Literal, TYPE_CHECKING import torch from loguru import logger -from modelforge.dataset.dataset import DataModule + +# import DataModule for typing hint +if TYPE_CHECKING: + from modelforge.dataset.dataset import DataModule def visualize_model( - dm: DataModule, + dm: 'DataModule', potential_name: Literal["ANI2x", "PhysNet", "SchNet", "PaiNN", "SAKE"], ): # visualize the compute graph @@ -314,3 +317,58 @@ def __exit__(self, *args): # fcntl.flock(self._file_handle.fileno(), fcntl.LOCK_UN) unlock_file(self._file_handle) self._file_handle.close() + + +import os +from functools import wraps + + +def lock_with_attribute(attribute_name): + """ + Decorator for locking a method using a lock file path stored in an instance + attribute. The attribute is accessed on the instance (`self`) at runtime. + + Parameters + ---------- + attribute_name : str + The name of the instance attribute that contains the lock file path. 
+ + Examples + -------- + >>> from modelforge.utils.misc import lock_with_attribute + >>> + >>> class MyClass: + >>> def __init__(self, lock_file): + >>> self.method_lock = lock_file + >>> + >>> @lock_with_attribute('method_lock') + >>> def critical_section(self): + >>> print("Executing critical section") + """ + + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + # Retrieve the instance (`self`) + instance = args[0] + # Get the lock file path from the specified attribute + lock_file_path = getattr(instance, attribute_name) + with open(lock_file_path, 'w') as lock_file: + # Lock the file + lock_file(lock_file) + + try: + # Execute the wrapped function + result = func(*args, **kwargs) + finally: + # Unlock the file + unlock_file(lock_file) + + # Optionally, remove the lock file + os.remove(lock_file_path) + + return result + + return wrapper + + return decorator From 992b7bc4b10169db10267f5cb0933ae006055255 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:43:09 +0200 Subject: [PATCH 36/66] lock --- modelforge/dataset/dataset.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 4ae48bf9..433138de 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -15,13 +15,14 @@ from modelforge.dataset.utils import RandomRecordSplittingStrategy, SplittingStrategy from modelforge.utils.prop import PropertyNames +from modelforge.utils.misc import lock_with_attribute if TYPE_CHECKING: from modelforge.potential.processing import AtomicSelfEnergies from enum import Enum -from pydantic import BaseModel, ConfigDict, Field, field_validator +from pydantic import BaseModel, ConfigDict, Field class CaseInsensitiveEnum(str, Enum): @@ -1045,6 +1046,7 @@ def create_dataset( from openff.units import unit + class DataModule(pl.LightningDataModule): def __init__( self, @@ -1128,8 +1130,12 @@ def __init__( self.dataset_statistic_filename = ( f"{self.local_cache_dir}/{self.name}_dataset_statistic.toml" ) - self.cache_processed_dataset_filename = f"{self.local_cache_dir}/{self.name}_{self.version_select}processed_dataset.pt" + self.cache_processed_dataset_filename = ( + f"{self.local_cache_dir}/{self.name}_{self.version_select}_processed.pt" + ) + self.lock_file = f"{self.cache_processed_dataset_filename}.lockfile" + @lock_with_attribute("lock_file") def prepare_data( self, ) -> None: @@ -1138,6 +1144,8 @@ def prepare_data( processing of the data such as calculating self energies, atomic energy statistics, and splitting. It is executed only once per node. 
""" + # check if there is a filelock present, if so, wait until it is removed + # if the dataset has already been processed, skip this step if ( os.path.exists(self.cache_processed_dataset_filename) @@ -1151,6 +1159,7 @@ def prepare_data( return None # if the dataset is not already processed, process it + from modelforge.dataset import _ImplementedDatasets dataset_class = _ImplementedDatasets.get_dataset_class(str(self.name)) @@ -1161,7 +1170,6 @@ def prepare_data( regenerate_cache=self.regenerate_cache, ) torch_dataset = self._create_torch_dataset(dataset) - # if dataset statistics is present load it from disk if ( os.path.exists(self.dataset_statistic_filename) @@ -1308,7 +1316,7 @@ def _cache_dataset(self, torch_dataset): def setup(self, stage: Optional[str] = None) -> None: """Sets up datasets for the train, validation, and test stages based on the stage argument.""" - self.torch_dataset = torch.load("torch_dataset.pt") + self.torch_dataset = torch.load(self.cache_processed_dataset_filename) ( self.train_dataset, self.val_dataset, From b5f9888d7a02a4deb5688146f2d0e5c01a777cd9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 13:56:34 +0200 Subject: [PATCH 37/66] typo --- modelforge/utils/misc.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index 2a0e7280..ea288a4a 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -13,7 +13,7 @@ def visualize_model( - dm: 'DataModule', + dm: "DataModule", potential_name: Literal["ANI2x", "PhysNet", "SchNet", "PaiNN", "SAKE"], ): # visualize the compute graph @@ -353,16 +353,16 @@ def wrapper(*args, **kwargs): instance = args[0] # Get the lock file path from the specified attribute lock_file_path = getattr(instance, attribute_name) - with open(lock_file_path, 'w') as lock_file: + with open(lock_file_path, "w") as f: # Lock the file - lock_file(lock_file) + lock_file(f) try: # Execute the wrapped function result = func(*args, **kwargs) finally: # Unlock the file - unlock_file(lock_file) + unlock_file(f) # Optionally, remove the lock file os.remove(lock_file_path) From acd06fee31ddcffda3f59e3ad36a80048c7e3d4e Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:37:55 +0200 Subject: [PATCH 38/66] correct lock file mode --- modelforge/utils/misc.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index ea288a4a..f4e918f2 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -353,7 +353,7 @@ def wrapper(*args, **kwargs): instance = args[0] # Get the lock file path from the specified attribute lock_file_path = getattr(instance, attribute_name) - with open(lock_file_path, "w") as f: + with open(lock_file_path, "w+") as f: # Lock the file lock_file(f) From 62003b7815461670dce32cab839630c62db9f571 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:40:00 +0200 Subject: [PATCH 39/66] linting --- modelforge/dataset/dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 433138de..4cfd8aa0 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1046,7 +1046,6 @@ def create_dataset( from openff.units import unit - class DataModule(pl.LightningDataModule): def __init__( self, From e48355191ba99634704d1666aa181c1bfd65a051 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 15:50:12 +0200 Subject: [PATCH 40/66] fix test failures --- 
modelforge/tests/conftest.py | 2 ++ modelforge/tests/test_training.py | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index f7ccd939..ad519b40 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,6 +51,7 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, + regenerate_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -64,6 +65,7 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, + regenerate_cache=regenerate_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index a442f980..9393e858 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -84,7 +84,7 @@ def get_trainer(potential_name: str, dataset_name: str, training_toml: str): @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) -@pytest.mark.parametrize("dataset_name", ["QM9", "SPICE2"]) +@pytest.mark.parametrize("dataset_name", ["PHALKETHOH"]) @pytest.mark.parametrize("training", ["with_force", "without_force"]) def test_train_with_lightning(training, potential_name, dataset_name): """ From c2cfb5f9b5cf95f73ad8a0e3feb85ed22764fa2f Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 17:30:17 +0200 Subject: [PATCH 41/66] reasonable defaults for regeneration --- modelforge/dataset/dataset.py | 15 ++++++++++++++- modelforge/tests/conftest.py | 4 ++-- modelforge/tests/test_dataset.py | 1 + modelforge/train/training.py | 1 + 4 files changed, 18 insertions(+), 3 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 4cfd8aa0..cb2e9a0e 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -64,6 +64,7 @@ class DatasetParameters(BaseModel): version_select: str num_workers: int = Field(gt=0) pin_memory: bool + regenerate_processed_cache: bool = False @dataclass(frozen=False) @@ -474,6 +475,7 @@ def __init__( local_cache_dir: str, force_download: bool = False, regenerate_cache: bool = False, + regenerate_processed_cache: bool = False, ): """ Initializes the HDF5Dataset with paths to raw and processed data files. @@ -504,6 +506,11 @@ def __init__( self.force_download = force_download self.regenerate_cache = regenerate_cache + # is True if regenerate_cache is True + self.regenerate_processed_cache = ( + regenerate_processed_cache or self.regenerate_cache + ) + self.hdf5data: Optional[Dict[str, List[np.ndarray]]] = None self.numpy_data: Optional[np.ndarray] = None @@ -1068,6 +1075,7 @@ def __init__( local_cache_dir: str = "./", regenerate_cache: bool = False, regenerate_dataset_statistic: bool = False, + regenerate_processed_cache: bool = True, ): """ Initializes adData module for PyTorch Lightning handling data preparation and loading object with the specified configuration. 
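Taken together, these flags gate a simple guard at the top of the data preparation step: reuse the processed cache when it exists, unless regeneration was requested explicitly. A condensed sketch of that control flow with made-up file names, not the actual modelforge method:

import os

def prepare_data(cache_file: str, statistics_file: str, regenerate: bool) -> None:
    if os.path.exists(cache_file) and not regenerate:
        if not os.path.exists(statistics_file):
            raise FileNotFoundError(
                f"{statistics_file} not found; please regenerate the cache."
            )
        print("Processed dataset already exists, skipping preparation.")
        return
    # ... otherwise process the raw dataset and write cache_file and statistics_file ...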
@@ -1124,6 +1132,11 @@ def __init__( # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) self.regenerate_cache = regenerate_cache + # Use a logical OR to ensure regenerate_processed_cache is True when + # regenerate_cache is True + self.regenerate_processed_cache = ( + regenerate_processed_cache or self.regenerate_cache + ) self.pairlist = Pairlist() self.dataset_statistic_filename = ( @@ -1148,7 +1161,7 @@ def prepare_data( # if the dataset has already been processed, skip this step if ( os.path.exists(self.cache_processed_dataset_filename) - and not self.regenerate_cache + and not self.regenerate_processed_cache ): if not os.path.exists(self.dataset_statistic_filename): raise FileNotFoundError( diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index ad519b40..b7c0c48f 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,7 +51,7 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, - regenerate_cache: bool = True, + regenerate_processed_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -65,7 +65,7 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, - regenerate_cache=regenerate_cache, + regenerate_processed_cache=regenerate_processed_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index 91f963a3..de15775a 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -730,6 +730,7 @@ def test_energy_postprocessing(): splitting_strategy=FirstComeFirstServeSplittingStrategy(), remove_self_energies=True, regenerate_dataset_statistic=True, + regenerate_processed_cache=True, ) dm.prepare_data() dm.setup() diff --git a/modelforge/train/training.py b/modelforge/train/training.py index c9d36ec9..254e2acf 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -1037,6 +1037,7 @@ def setup_datamodule(self) -> DataModule: seed=self.training_parameter.splitting_strategy.seed, split=self.training_parameter.splitting_strategy.data_split, ), + regenerate_processed_cache=self.dataset_parameter.regenerate_processed_cache, ) dm.prepare_data() dm.setup() From a1daf2cd0853cc0f924120ab3a2b96fcfda065b9 Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 17:50:30 +0200 Subject: [PATCH 42/66] ha, didn't think about that --- modelforge/dataset/dataset.py | 8 ++------ modelforge/tests/conftest.py | 2 -- modelforge/tests/test_dataset.py | 1 - 3 files changed, 2 insertions(+), 9 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index cb2e9a0e..7a7944d6 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -475,7 +475,6 @@ def __init__( local_cache_dir: str, force_download: bool = False, regenerate_cache: bool = False, - regenerate_processed_cache: bool = False, ): """ Initializes the HDF5Dataset with paths to raw and processed data files. 
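One easy-to-miss detail in the cache-directory handling: the user-supplied path has to be expanded before the directory is created, otherwise a literal "~" folder ends up in the current working directory. The pattern in isolation, with an illustrative path:

import os

local_cache_dir = os.path.expanduser("~/.cache/modelforge_demo")  # illustrative path
os.makedirs(local_cache_dir, exist_ok=True)  # no error if it already exists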
@@ -506,11 +505,6 @@ def __init__( self.force_download = force_download self.regenerate_cache = regenerate_cache - # is True if regenerate_cache is True - self.regenerate_processed_cache = ( - regenerate_processed_cache or self.regenerate_cache - ) - self.hdf5data: Optional[Dict[str, List[np.ndarray]]] = None self.numpy_data: Optional[np.ndarray] = None @@ -1131,6 +1125,8 @@ def __init__( # make sure we can handle a path with a ~ in it self.local_cache_dir = os.path.expanduser(local_cache_dir) + # create the local cache directory if it does not exist + os.makedirs(self.local_cache_dir, exist_ok=True) self.regenerate_cache = regenerate_cache # Use a logical OR to ensure regenerate_processed_cache is True when # regenerate_cache is True diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index b7c0c48f..f7ccd939 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -51,7 +51,6 @@ def initialize_datamodule( remove_self_energies: bool = True, regression_ase: bool = False, regenerate_dataset_statistic: bool = False, - regenerate_processed_cache: bool = True, ) -> DataModule: """ Initialize a dataset for a given mode. @@ -65,7 +64,6 @@ def initialize_datamodule( remove_self_energies=remove_self_energies, regression_ase=regression_ase, regenerate_dataset_statistic=regenerate_dataset_statistic, - regenerate_processed_cache=regenerate_processed_cache, ) data_module.prepare_data() data_module.setup() diff --git a/modelforge/tests/test_dataset.py b/modelforge/tests/test_dataset.py index de15775a..91f963a3 100644 --- a/modelforge/tests/test_dataset.py +++ b/modelforge/tests/test_dataset.py @@ -730,7 +730,6 @@ def test_energy_postprocessing(): splitting_strategy=FirstComeFirstServeSplittingStrategy(), remove_self_energies=True, regenerate_dataset_statistic=True, - regenerate_processed_cache=True, ) dm.prepare_data() dm.setup() From 2bc8c3cb62bedd95393a487a73ccaee656bb62ef Mon Sep 17 00:00:00 2001 From: wiederm Date: Tue, 27 Aug 2024 21:31:55 +0200 Subject: [PATCH 43/66] update parameter name --- docs/getting_started.rst | 2 +- modelforge/tests/data/config.toml | 2 +- .../tests/data/training_defaults/default.toml | 2 +- .../training_defaults/default_with_force.toml | 2 +- modelforge/train/parameters.py | 3 ++- modelforge/train/training.py | 22 ++----------------- scripts/config.toml | 2 +- 7 files changed, 9 insertions(+), 26 deletions(-) diff --git a/docs/getting_started.rst b/docs/getting_started.rst index c533e47f..0dc1b6bd 100644 --- a/docs/getting_started.rst +++ b/docs/getting_started.rst @@ -149,7 +149,7 @@ Here is an example of a training routine definition: remove_self_energies = true # Whether to remove self-energies from the dataset batch_size = 128 # Number of samples per batch lr = 1e-3 # Learning rate for the optimizer - monitor = "val/per_molecule_energy/rmse" # Metric to monitor for early stopping and checkpointing + monitor_for_checkpoint = "val/per_molecule_energy/rmse" # Metric to monitor for checkpointing [training.experiment_logger] diff --git a/modelforge/tests/data/config.toml b/modelforge/tests/data/config.toml index c323c354..53e408cf 100644 --- a/modelforge/tests/data/config.toml +++ b/modelforge/tests/data/config.toml @@ -35,7 +35,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" diff --git a/modelforge/tests/data/training_defaults/default.toml 
b/modelforge/tests/data/training_defaults/default.toml index 1e4025a9..c8d96fc2 100644 --- a/modelforge/tests/data/training_defaults/default.toml +++ b/modelforge/tests/data/training_defaults/default.toml @@ -3,7 +3,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] diff --git a/modelforge/tests/data/training_defaults/default_with_force.toml b/modelforge/tests/data/training_defaults/default_with_force.toml index 19c0d3b7..4ba07f8b 100644 --- a/modelforge/tests/data/training_defaults/default_with_force.toml +++ b/modelforge/tests/data/training_defaults/default_with_force.toml @@ -3,7 +3,7 @@ number_of_epochs = 2 remove_self_energies = true batch_size = 128 lr = 1e-3 -monitor = "val/per_molecule_energy/rmse" +monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] diff --git a/modelforge/train/parameters.py b/modelforge/train/parameters.py index 1ea820d7..f6daf0d2 100644 --- a/modelforge/train/parameters.py +++ b/modelforge/train/parameters.py @@ -262,7 +262,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": remove_self_energies: bool batch_size: int lr: float - monitor: str + monitor_for_checkpoint: str lr_scheduler: Optional[SchedulerConfig] = None loss_parameter: LossParameter early_stopping: Optional[EarlyStopping] = None @@ -271,6 +271,7 @@ def ensure_logger_configuration(self) -> "ExperimentLogger": experiment_logger: ExperimentLogger verbose: bool = False optimizer: Type[torch.optim.Optimizer] = torch.optim.AdamW + min_number_of_epochs: Union[int, None] = None ### Runtime Parameters diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 254e2acf..3829c9e3 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -387,25 +387,6 @@ def create_error_metrics(loss_properties: List[str], loss: bool = False) -> Modu return metric_dict -from torchmetrics import Metric - - -class MeanLossMetric(Metric): - def __init__(self): - super().__init__() - self.add_state("sum_loss", default=torch.tensor(0.0), dist_reduce_fx="sum") - self.add_state("total_batches", default=torch.tensor(0), dist_reduce_fx="sum") - - def update(self, loss: torch.Tensor, batch_size: int): - # Accumulate the loss sum and batch count - self.sum_loss += loss.sum() - self.total_batches += batch_size - - def compute(self): - # Compute the mean loss - return self.sum_loss / self.total_batches - - class CalculateProperties(torch.nn.Module): def __init__(self, requested_properties: List[str]): @@ -1147,7 +1128,7 @@ def setup_callbacks(self) -> List[Any]: ) checkpoint_callback = ModelCheckpoint( save_top_k=2, - monitor=self.training_parameter.monitor, + monitor=self.training_parameter.monitor_for_checkpoint, filename=checkpoint_filename, ) callbacks.append(checkpoint_callback) @@ -1170,6 +1151,7 @@ def setup_trainer(self) -> Trainer: trainer = Trainer( max_epochs=self.training_parameter.number_of_epochs, + min_epochs=self.training_parameter.min_number_of_epochs, num_nodes=self.runtime_parameter.number_of_nodes, devices=self.runtime_parameter.devices, accelerator=self.runtime_parameter.accelerator, diff --git a/scripts/config.toml b/scripts/config.toml index d6e92a10..b49e1434 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -33,7 +33,7 @@ number_of_epochs = 1000 remove_self_energies = true batch_size = 16 lr = 0.5e-3 -monitor = "val/per_molecule_energy/rmse" 
+monitor_for_checkpoint = "val/per_molecule_energy/rmse" [training.experiment_logger] logger_name = "tensorboard" From 225bad41ea7ddf6a6c274e1af2599b3523d660de Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 28 Aug 2024 20:33:32 -0700 Subject: [PATCH 44/66] enable removing of high energy states from phalkethoh dataset --- modelforge/curation/phalkethoh_curation.py | 201 ++++++++++-------- .../curation/scripts/curate_PhAlkEthOH.py | 9 +- modelforge/dataset/dataset.py | 2 + 3 files changed, 122 insertions(+), 90 deletions(-) diff --git a/modelforge/curation/phalkethoh_curation.py b/modelforge/curation/phalkethoh_curation.py index 600ef8ca..21c5e346 100644 --- a/modelforge/curation/phalkethoh_curation.py +++ b/modelforge/curation/phalkethoh_curation.py @@ -267,7 +267,7 @@ def _calculate_total_charge( rdmol = Chem.MolFromSmiles(smiles, sanitize=False) total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms()) - return (int(total_charge) * unit.elementary_charge,) + return int(total_charge) * unit.elementary_charge def _process_downloaded( self, @@ -277,6 +277,7 @@ def _process_downloaded( max_conformers_per_record: Optional[int] = None, total_conformers: Optional[int] = None, atomic_numbers_to_limit: Optional[List[int]] = None, + max_force: Optional[unit.Quantity] = None, ): """ Processes a downloaded dataset: extracts relevant information. @@ -295,6 +296,8 @@ def _process_downloaded( If set, this will limit the total number of conformers to the specified number. atomic_numbers_to_limit: Optional[List[int]], optional, default=None If set, this will limit the dataset to only include molecules with atomic numbers in the list. + max_force: Optional[float], optional, default=None + If set, this will exclude any conformers with a force that exceeds this value. 
""" from tqdm import tqdm import numpy as np @@ -358,7 +361,7 @@ def _process_downloaded( ] data_temp["n_configs"] = 0 - (data_temp["total_charge"],) = self._calculate_total_charge( + data_temp["total_charge"] = self._calculate_total_charge( data_temp[ "canonical_isomeric_explicit_hydrogen_mapped_smiles" ] @@ -378,104 +381,120 @@ def _process_downloaded( index = self.molecule_names[name] for state in trajectory: + add_record = True properties, config = state - self.data[index]["n_configs"] += 1 - - # note, we will use the convention of names being lowercase - # and spaces denoted by underscore - quantity = "geometry" - quantity_o = "geometry" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = config.reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - config.reshape(1, -1, 3), + + # if set, let us see if the configuration has a force that exceeds the maximum + if max_force is not None: + force_magnitude = ( + np.abs( + properties["properties"]["current gradient"] + + properties["properties"][ + "dispersion correction gradient" + ] ) + * self.qm_parameters["dft_total_force"]["u_in"] ) + if np.any(force_magnitude > max_force): + add_record = False + if add_record: + self.data[index]["n_configs"] += 1 + + # note, we will use the convention of names being lowercase + # and spaces denoted by underscore + quantity = "geometry" + quantity_o = "geometry" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = config.reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + config.reshape(1, -1, 3), + ) + ) - # note, we will use the convention of names being lowercase - # and spaces denoted by underscore - quantity = "current energy" - quantity_o = "dft_total_energy" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = properties["properties"][ - quantity - ] - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - properties["properties"][quantity], + # note, we will use the convention of names being lowercase + # and spaces denoted by underscore + quantity = "current energy" + quantity_o = "dft_total_energy" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = properties["properties"][ + quantity + ] + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + properties["properties"][quantity], + ) ) - ) - quantity = "dispersion correction energy" - quantity_o = "dispersion_correction_energy" - # Note need to typecast here because of a bug in the - # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766 - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - float(properties["properties"][quantity]) - ).reshape(1, 1) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - float(properties["properties"][quantity]) - ).reshape(1, 1), - ), - ) + quantity = "dispersion correction energy" + quantity_o = "dispersion_correction_energy" + # Note need to typecast here because of a bug in the + # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766 + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + float(properties["properties"][quantity]) + ).reshape(1, 1) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + 
float(properties["properties"][quantity]) + ).reshape(1, 1), + ), + ) - quantity = "current gradient" - quantity_o = "dft_total_gradient" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3), + quantity = "current gradient" + quantity_o = "dft_total_gradient" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3), + ) ) - ) - quantity = "dispersion correction gradient" - quantity_o = "dispersion_correction_gradient" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, -1, 3), + quantity = "dispersion correction gradient" + quantity_o = "dispersion_correction_gradient" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, -1, 3), + ) ) - ) - quantity = "scf dipole" - quantity_o = "scf_dipole" - if quantity_o not in self.data[index].keys(): - self.data[index][quantity_o] = np.array( - properties["properties"][quantity] - ).reshape(1, 3) - else: - self.data[index][quantity_o] = np.vstack( - ( - self.data[index][quantity_o], - np.array( - properties["properties"][quantity] - ).reshape(1, 3), + quantity = "scf dipole" + quantity_o = "scf_dipole" + if quantity_o not in self.data[index].keys(): + self.data[index][quantity_o] = np.array( + properties["properties"][quantity] + ).reshape(1, 3) + else: + self.data[index][quantity_o] = np.vstack( + ( + self.data[index][quantity_o], + np.array( + properties["properties"][quantity] + ).reshape(1, 3), + ) ) - ) # assign units for datapoint in self.data: @@ -564,6 +583,7 @@ def process( max_conformers_per_record: Optional[int] = None, total_conformers: Optional[int] = None, limit_atomic_species: Optional[list] = None, + max_force: Optional[unit.Quantity] = None, n_threads=2, ) -> None: """ @@ -586,7 +606,9 @@ def process( Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2" limit_atomic_species: Optional[list] = None, If set to a list of element symbols, records that contain any elements not in this list will be ignored. - n_threads, int, default=6 + max_force: Optional[float], optional, default=None + If set this any confirugrations with a force that exceeds this value will be excluded. 
+ n_threads, int, default=2 Number of concurrent threads for retrieving data from QCArchive Examples -------- @@ -664,6 +686,7 @@ def process( max_conformers_per_record=max_conformers_per_record, total_conformers=total_conformers, atomic_numbers_to_limit=self.atomic_numbers_to_limit, + max_force=max_force, ) self._generate_hdf5() diff --git a/modelforge/curation/scripts/curate_PhAlkEthOH.py b/modelforge/curation/scripts/curate_PhAlkEthOH.py index 6dfab740..523464d2 100644 --- a/modelforge/curation/scripts/curate_PhAlkEthOH.py +++ b/modelforge/curation/scripts/curate_PhAlkEthOH.py @@ -20,6 +20,7 @@ def PhAlkEthOH_openff_wrapper( max_conformers_per_record=None, total_conformers=None, limit_atomic_species=None, + max_force=None, ): """ This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file. @@ -49,7 +50,8 @@ def PhAlkEthOH_openff_wrapper( limit_atomic_species: list, optional, default=None A list of atomic species to limit the dataset to. Any molecules that contain elements outside of this list will be ignored. If not defined, no filtering by atomic species will be performed. - + max_force: float, optional, default=None + The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored. """ from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration @@ -67,6 +69,7 @@ def PhAlkEthOH_openff_wrapper( total_conformers=total_conformers, limit_atomic_species=limit_atomic_species, n_threads=1, + max_force=max_force, ) print(f"Total records: {PhAlkEthOH_dataset.total_records}") print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}") @@ -74,6 +77,8 @@ def PhAlkEthOH_openff_wrapper( def main(): + from openff.units import unit + # define the location where to store and output the files import os @@ -99,6 +104,7 @@ def main(): max_records=1000, total_conformers=1000, max_conformers_per_record=10, + max_force=1.0 * unit.hartree / unit.bohr, ) # curate the full dataset @@ -110,6 +116,7 @@ def main(): local_cache_dir, force_download=False, version_select=version_select, + max_force=1.0 * unit.hartree / unit.bohr, ) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 53142d86..07a80f69 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -488,6 +488,8 @@ def __init__( Directory to store the files. force_download : bool, optional If set to True, the data will be downloaded even if it already exists. Default is False. + regenerate_cache : bool, optional + If set to True, the cache file will be regenerated even if it already exists. Default is False. 
""" self.url = url self.gz_data_file = gz_data_file From d3539e5b24907df5027ecd84eace87ec8e8f0363 Mon Sep 17 00:00:00 2001 From: wiederm Date: Thu, 29 Aug 2024 16:39:25 +0200 Subject: [PATCH 45/66] detach metric --- modelforge/train/training.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelforge/train/training.py b/modelforge/train/training.py index 3829c9e3..23ab1a68 100644 --- a/modelforge/train/training.py +++ b/modelforge/train/training.py @@ -709,10 +709,10 @@ def training_step(self, batch: "BatchData", batch_idx: int) -> torch.Tensor: # Update the loss metric with the different loss components for key, metric in loss_dict.items(): - self.loss_metric[key].update(metric, batch.batch_size()) + self.loss_metric[key].update(metric.clone().detach(), batch.batch_size()) - loss = torch.mean(loss_dict["total_loss"]).contiguous() - return loss + loss = torch.mean(loss_dict["total_loss"]) + return loss.contiguous() def validation_step(self, batch: "BatchData", batch_idx: int) -> None: """ From 904d3901b354d55c6a2a86455631be39cbee6dfe Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 29 Aug 2024 09:13:01 -0700 Subject: [PATCH 46/66] added v1 phalkethoh --- modelforge/curation/phalkethoh_curation.py | 10 +++++ .../curation/scripts/curate_PhAlkEthOH.py | 37 ++++++++++++++++++- modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 15 ++++++++ 3 files changed, 60 insertions(+), 2 deletions(-) diff --git a/modelforge/curation/phalkethoh_curation.py b/modelforge/curation/phalkethoh_curation.py index 21c5e346..52d1ec2f 100644 --- a/modelforge/curation/phalkethoh_curation.py +++ b/modelforge/curation/phalkethoh_curation.py @@ -278,6 +278,7 @@ def _process_downloaded( total_conformers: Optional[int] = None, atomic_numbers_to_limit: Optional[List[int]] = None, max_force: Optional[unit.Quantity] = None, + final_conformer_only: Optional[bool] = None, ): """ Processes a downloaded dataset: extracts relevant information. @@ -298,6 +299,9 @@ def _process_downloaded( If set, this will limit the dataset to only include molecules with atomic numbers in the list. max_force: Optional[float], optional, default=None If set, this will exclude any conformers with a force that exceeds this value. + final_conformer_only: Optional[bool], optional, default=None + If set to True, only the final conformer of each record will be processed. This should be the final + energy minimized conformer. """ from tqdm import tqdm import numpy as np @@ -380,6 +384,8 @@ def _process_downloaded( name = key index = self.molecule_names[name] + if final_conformer_only: + trajectory = [trajectory[-1]] for state in trajectory: add_record = True properties, config = state @@ -584,6 +590,7 @@ def process( total_conformers: Optional[int] = None, limit_atomic_species: Optional[list] = None, max_force: Optional[unit.Quantity] = None, + final_conformer_only=None, n_threads=2, ) -> None: """ @@ -608,6 +615,8 @@ def process( If set to a list of element symbols, records that contain any elements not in this list will be ignored. max_force: Optional[float], optional, default=None If set this any confirugrations with a force that exceeds this value will be excluded. + final_conformer_only: Optional[bool], optional, default=None + If set to True, only the final conformer of each record will be processed. 
n_threads, int, default=2 Number of concurrent threads for retrieving data from QCArchive Examples @@ -687,6 +696,7 @@ def process( total_conformers=total_conformers, atomic_numbers_to_limit=self.atomic_numbers_to_limit, max_force=max_force, + final_conformer_only=final_conformer_only, ) self._generate_hdf5() diff --git a/modelforge/curation/scripts/curate_PhAlkEthOH.py b/modelforge/curation/scripts/curate_PhAlkEthOH.py index 523464d2..cecff1ff 100644 --- a/modelforge/curation/scripts/curate_PhAlkEthOH.py +++ b/modelforge/curation/scripts/curate_PhAlkEthOH.py @@ -21,6 +21,7 @@ def PhAlkEthOH_openff_wrapper( total_conformers=None, limit_atomic_species=None, max_force=None, + final_conformer_only=False, ): """ This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file. @@ -52,6 +53,8 @@ def PhAlkEthOH_openff_wrapper( will be ignored. If not defined, no filtering by atomic species will be performed. max_force: float, optional, default=None The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored. + final_conformer_only: bool, optional, default=False + If True, only the final conformer for each molecule will be processed. If False, all conformers will be processed. """ from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration @@ -70,6 +73,7 @@ def PhAlkEthOH_openff_wrapper( limit_atomic_species=limit_atomic_species, n_threads=1, max_force=max_force, + final_conformer_only=final_conformer_only, ) print(f"Total records: {PhAlkEthOH_dataset.total_records}") print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}") @@ -88,9 +92,9 @@ def main(): # We'll want to provide some simple means of versioning # if we make updates to either the underlying dataset, curation modules, or parameters given to the code - version = "0" + version = "1" # version of the dataset to curate - version_select = f"v_{version}" + version_select = f"v_0" # curate dataset with 1000 total conformers, max of 10 conformers per record hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000.hdf5" @@ -119,6 +123,35 @@ def main(): max_force=1.0 * unit.hartree / unit.bohr, ) + # curate dataset with 1000 total conformers, max of 10 conformers per record + hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000_minimal.hdf5" + + PhAlkEthOH_openff_wrapper( + hdf5_file_name, + output_file_dir, + local_cache_dir, + force_download=False, + version_select=version_select, + max_records=1000, + total_conformers=1000, + max_conformers_per_record=10, + max_force=1.0 * unit.hartree / unit.bohr, + final_conformer_only=True, + ) + + # curate the full dataset + hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_minimal.hdf5" + print("total dataset") + PhAlkEthOH_openff_wrapper( + hdf5_file_name, + output_file_dir, + local_cache_dir, + force_download=False, + version_select=version_select, + max_force=1.0 * unit.hartree / unit.bohr, + final_conformer_only=True, + ) + if __name__ == "__main__": main() diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml index 06ca005e..d4e96096 100644 --- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml +++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml @@ -1,6 +1,21 @@ dataset: PhAlkEthOH latest: full_dataset_v0 latest_test: nc_1000_v0 +full_dataset_v1: + version: 1 + doi: 10.5281/zenodo.13450735 + notes: removes high force conformers + gz_data_file: + length: 3300668359 + md5: b051af374f3233e2925f7a1b96707772 + name: 
PhAlkEthOH_dataset_v0.hdf5.gz + hdf5_data_file: + md5: f5d9dccb8e79a51892b671108bc57bde + name: PhAlkEthOH_dataset_v1.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_processed.npz + url: https://zenodo.org/records/13450735/files/PhAlkEthOH_openff_dataset_v1.hdf5.gz full_dataset_v0: version: 0 doi: 10.5281/zenodo.12174233 From 77a847772b98fc287aa19af25c722de4d1876558 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 29 Aug 2024 18:39:11 -0700 Subject: [PATCH 47/66] additional dataset versions added. --- modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 33 +++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml index d4e96096..2f59df24 100644 --- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml +++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml @@ -1,6 +1,6 @@ dataset: PhAlkEthOH latest: full_dataset_v0 -latest_test: nc_1000_v0 +latest_test: nc_1000_v1 full_dataset_v1: version: 1 doi: 10.5281/zenodo.13450735 @@ -8,7 +8,7 @@ full_dataset_v1: gz_data_file: length: 3300668359 md5: b051af374f3233e2925f7a1b96707772 - name: PhAlkEthOH_dataset_v0.hdf5.gz + name: PhAlkEthOH_dataset_v1.hdf5.gz hdf5_data_file: md5: f5d9dccb8e79a51892b671108bc57bde name: PhAlkEthOH_dataset_v1.hdf5 @@ -16,6 +16,35 @@ full_dataset_v1: md5: null name: PhAlkEthOH_dataset_v1_processed.npz url: https://zenodo.org/records/13450735/files/PhAlkEthOH_openff_dataset_v1.hdf5.gz +nc_1000_v1: + version: 1 + doi: 10.5281/zenodo.13560343 + gz_data_file: + length: 2702091 + md5: 76b421802bef68f858757dba41f3ea2e + name: PhAlkEthOH_dataset_v1_nc_1000.hdf5.gz + hdf5_data_file: + md5: 244eb8d1b3547b8da229fd1507fb4d4e + name: PhAlkEthOH_dataset_v1_nc_1000.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_nc_1000_processed.npz + url: https://zenodo.org/records/13560343/files/PhAlkEthOH_openff_dataset_v1_ntc_1000.hdf5.gz +full_dataset_min_v1: + version: 1 + doi: 10.5281/zenodo.13561100 + notes: removes high force configurations, only contains final optimized configuration + gz_data_file: + length: 31352642 + md5: 205b0b7bc1858b1d3745480d9a29a770 + name: PhAlkEthOH_dataset_v1_min.hdf5.gz + hdf5_data_file: + md5: 41cb40718f8872baa6c468ab08574d46 + name: PhAlkEthOH_dataset_v1_min.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_min_processed.npz + url: https://zenodo.org/records/13561100/files/PhAlkEthOH_openff_dataset_v1_min.hdf5.gz full_dataset_v0: version: 0 doi: 10.5281/zenodo.12174233 From 05e29f2aabddf7bbf4c46c5e299e6c7806791067 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 29 Aug 2024 22:46:06 -0700 Subject: [PATCH 48/66] additional dataset versions added. 
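The entries added below follow the existing PhAlkEthOH.yaml layout: each version key carries the
zenodo url, the gzipped and extracted file names, and md5 checksums, while "latest"/"latest_test"
are aliases that point at a concrete entry. As a rough sketch (PyYAML assumed; the helper name
resolve_dataset_version is illustrative only, not a modelforge API), resolving one entry before
download could look like:

    import yaml

    def resolve_dataset_version(yaml_path: str, version_select: str = "latest") -> dict:
        # Load the dataset spec, e.g. modelforge/dataset/yaml_files/PhAlkEthOH.yaml
        with open(yaml_path) as f:
            spec = yaml.safe_load(f)
        # "latest" and "latest_test" are aliases naming a concrete version entry
        if version_select in ("latest", "latest_test"):
            version_select = spec[version_select]
        entry = spec[version_select]
        # Return only the fields needed to fetch and verify the archive
        return {
            "url": entry["url"],
            "gz_name": entry["gz_data_file"]["name"],
            "gz_md5": entry["gz_data_file"]["md5"],
            "hdf5_name": entry["hdf5_data_file"]["name"],
        }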
--- modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml index 2f59df24..568e6c6b 100644 --- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml +++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml @@ -19,6 +19,7 @@ full_dataset_v1: nc_1000_v1: version: 1 doi: 10.5281/zenodo.13560343 + notes: removes high force conformers, 1000 conformers, max 10 per molecule gz_data_file: length: 2702091 md5: 76b421802bef68f858757dba41f3ea2e @@ -45,6 +46,21 @@ full_dataset_min_v1: md5: null name: PhAlkEthOH_dataset_v1_min_processed.npz url: https://zenodo.org/records/13561100/files/PhAlkEthOH_openff_dataset_v1_min.hdf5.gz +nc_1000_min_v1: + version: 1 + doi: 10.5281/zenodo.13576458 + notes: removes high force conformers, 1000 conformers, only contains final optimized configuration + gz_data_file: + length: 3476870 + md5: 7261f4738efd4bf8409268961837ba78 + name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5.gz + hdf5_data_file: + md5: 5d347a78c6c3b45531870a05d5aab77e + name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5 + processed_data_file: + md5: null + name: PhAlkEthOH_dataset_v1_nc_1000_min_processed.npz + url: https://zenodo.org/records/13576458/files/PhAlkEthOH_openff_dataset_v1_ntc_1000_min.hdf5.gz full_dataset_v0: version: 0 doi: 10.5281/zenodo.12174233 From ce41a4977818901a4b96d6619997ebb9391c1e0a Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 29 Aug 2024 23:50:37 -0700 Subject: [PATCH 49/66] updated "latest" --- modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 2 +- modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml index 568e6c6b..7d8d186a 100644 --- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml +++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml @@ -1,5 +1,5 @@ dataset: PhAlkEthOH -latest: full_dataset_v0 +latest: full_dataset_v1 latest_test: nc_1000_v1 full_dataset_v1: version: 1 diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml index 60281c0c..436ad852 100644 --- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml +++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml @@ -1,5 +1,5 @@ [dataset] dataset_name = "PHALKETHOH" -version_select = "nc_1000_v0" +version_select = "full_dataset_min_v1" num_workers = 4 pin_memory = true \ No newline at end of file From 031205caa47ad0680e1fcbab6cd8c7b600ed0727 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 30 Aug 2024 10:04:24 -0700 Subject: [PATCH 50/66] updated "latest" --- modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml index 436ad852..ddac0202 100644 --- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml +++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml @@ -1,5 +1,5 @@ [dataset] dataset_name = "PHALKETHOH" -version_select = "full_dataset_min_v1" +version_select = "nc_1000_v1" num_workers = 4 pin_memory = true \ No newline at end of file From 7c374a592342fbd42d2e44d370d0a40aa3051ed0 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Fri, 30 Aug 2024 17:04:56 -0700 Subject: [PATCH 51/66] Having test_train_lightning skip sake+forces when on github CI 
 because it allocates too much memory

---
 modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +-
 modelforge/tests/test_training.py                      | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
index 60281c0c..ddac0202 100644
--- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml
+++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
@@ -1,5 +1,5 @@
 [dataset]
 dataset_name = "PHALKETHOH"
-version_select = "nc_1000_v0"
+version_select = "nc_1000_v1"
 num_workers = 4
 pin_memory = true
\ No newline at end of file
diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py
index 9393e858..937d2571 100644
--- a/modelforge/tests/test_training.py
+++ b/modelforge/tests/test_training.py
@@ -95,7 +95,10 @@ def test_train_with_lightning(training, potential_name, dataset_name):
     # SKIP if potential is ANI and dataset is SPICE2
     if "ANI" in potential_name and dataset_name == "SPICE2":
         pytest.skip("ANI potential is not compatible with SPICE2 dataset")
-
+    if IN_GITHUB_ACTIONS and potential_name == "sake" and training == "with_force":
+        pytest.skip(
+            "Skipping Phalkethoh with sake training with forces on GitHub Actions because it allocates too much memory"
+        )
     # train potential
     get_trainer(
         potential_name, dataset_name, training_toml

From 2ca2b0af5ae1298c6c8b6487c7da0d33b74161c8 Mon Sep 17 00:00:00 2001
From: chrisiacovella
Date: Fri, 30 Aug 2024 22:57:59 -0700
Subject: [PATCH 52/66] fixed capitalization error, test should now skip

---
 modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +-
 modelforge/tests/test_training.py                      | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
index ddac0202..60281c0c 100644
--- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml
+++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
@@ -1,5 +1,5 @@
 [dataset]
 dataset_name = "PHALKETHOH"
-version_select = "nc_1000_v1"
+version_select = "nc_1000_v0"
 num_workers = 4
 pin_memory = true
\ No newline at end of file
diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py
index 937d2571..2b862b88 100644
--- a/modelforge/tests/test_training.py
+++ b/modelforge/tests/test_training.py
@@ -95,9 +95,9 @@ def test_train_with_lightning(training, potential_name, dataset_name):
     # SKIP if potential is ANI and dataset is SPICE2
     if "ANI" in potential_name and dataset_name == "SPICE2":
         pytest.skip("ANI potential is not compatible with SPICE2 dataset")
-    if IN_GITHUB_ACTIONS and potential_name == "sake" and training == "with_force":
+    if potential_name == "SAKE" and training == "with_force":
         pytest.skip(
-            "Skipping Phalkethoh with sake training with forces on GitHub Actions because it allocates too much memory"
+            "Skipping Sake training with forces on GitHub Actions because it allocates too much memory"
         )
     # train potential
     get_trainer(

From daaedff5c7f8c3cc3a45b62581fd3f0cc684d864 Mon Sep 17 00:00:00 2001
From: chrisiacovella
Date: Sat, 31 Aug 2024 00:49:19 -0700
Subject: [PATCH 53/66] in testing I removed the check to see if in github
 actions. This is resolved.

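The restored guard relies on the module-level IN_GITHUB_ACTIONS flag defined earlier in
test_training.py. A minimal sketch of that pattern (the exact definition is not shown in this
diff, so the flag line below is an assumption based on the GITHUB_ACTIONS environment variable
that GitHub-hosted runners set to "true"):

    import os
    import pytest

    # Assumed definition of the flag consulted by the skip above
    IN_GITHUB_ACTIONS = os.getenv("GITHUB_ACTIONS") == "true"

    def test_memory_heavy_case():
        if IN_GITHUB_ACTIONS:
            pytest.skip("skipped on GitHub-hosted runners (memory limited)")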
--- modelforge/tests/test_training.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/modelforge/tests/test_training.py b/modelforge/tests/test_training.py index 2b862b88..72ee5e15 100644 --- a/modelforge/tests/test_training.py +++ b/modelforge/tests/test_training.py @@ -95,7 +95,7 @@ def test_train_with_lightning(training, potential_name, dataset_name): # SKIP if potential is ANI and dataset is SPICE2 if "ANI" in potential_name and dataset_name == "SPICE2": pytest.skip("ANI potential is not compatible with SPICE2 dataset") - if potential_name == "SAKE" and training == "with_force": + if IN_GITHUB_ACTIONS and potential_name == "SAKE" and training == "with_force": pytest.skip( "Skipping Sake training with forces on GitHub Actions because it allocates too much memory" ) From 4ffc4d5912ba9a38c0d9962235ff4c44bc99a272 Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 31 Aug 2024 10:36:01 +0200 Subject: [PATCH 54/66] update weights --- scripts/config.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/config.toml b/scripts/config.toml index b49e1434..45e8d64c 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -57,7 +57,7 @@ interval = "epoch" loss_property = ['per_molecule_energy', 'per_atom_force'] # use [training.loss_parameter.weight] -per_molecule_energy = 0.009 #NOTE: reciprocal units +per_molecule_energy = 0.999 #NOTE: reciprocal units per_atom_force = 0.001 [training.early_stopping] From eb0ab52ad1be579d57314b9f54e265bcdabb7a4e Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 31 Aug 2024 10:45:23 +0200 Subject: [PATCH 55/66] reorder imports and tests --- modelforge/dataset/dataset.py | 68 +++++++++++++++++++++++++ modelforge/tests/conftest.py | 88 +++++---------------------------- modelforge/tests/test_models.py | 46 +---------------- modelforge/utils/misc.py | 46 +++++++++++++++++ 4 files changed, 128 insertions(+), 120 deletions(-) diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index b388b317..f87121e1 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1552,3 +1552,71 @@ def collate_conformers(conf_list: List[BatchData]) -> BatchData: number_of_atoms=atomic_numbers.numel(), ) return BatchData(nnp_input, metadata) + + +from modelforge.dataset.dataset import DatasetFactory +from modelforge.dataset.utils import ( + FirstComeFirstServeSplittingStrategy, + SplittingStrategy, +) + + +def initialize_datamodule( + dataset_name: str, + version_select: str = "nc_1000_v0", + batch_size: int = 64, + splitting_strategy: SplittingStrategy = FirstComeFirstServeSplittingStrategy(), + remove_self_energies: bool = True, + regression_ase: bool = False, + regenerate_dataset_statistic: bool = False, +) -> DataModule: + """ + Initialize a dataset for a given mode. + """ + + data_module = DataModule( + dataset_name, + splitting_strategy=splitting_strategy, + batch_size=batch_size, + version_select=version_select, + remove_self_energies=remove_self_energies, + regression_ase=regression_ase, + regenerate_dataset_statistic=regenerate_dataset_statistic, + ) + data_module.prepare_data() + data_module.setup() + return data_module + + +def single_batch(batch_size: int = 64, dataset_name="QM9"): + """ + Utility function to create a single batch of data for testing. 
+ """ + data_module = initialize_datamodule( + dataset_name=dataset_name, + batch_size=batch_size, + version_select="nc_1000_v0", + ) + return next(iter(data_module.train_dataloader(shuffle=False))) + + +def initialize_dataset( + dataset_name: str, + local_cache_dir: str, + versions_select: str = "nc_1000_v0", + force_download: bool = False, +) -> DataModule: + """ + Initialize a dataset for a given mode. + """ + from modelforge.dataset import _ImplementedDatasets + + factory = DatasetFactory() + data = _ImplementedDatasets.get_dataset_class(dataset_name)( + local_cache_dir=local_cache_dir, + version_select=versions_select, + force_download=force_download, + ) + dataset = factory.create_dataset(data) + + return dataset diff --git a/modelforge/tests/conftest.py b/modelforge/tests/conftest.py index f7ccd939..a67ffcaf 100644 --- a/modelforge/tests/conftest.py +++ b/modelforge/tests/conftest.py @@ -1,9 +1,10 @@ -import torch +from dataclasses import dataclass +from typing import Dict, Optional + import pytest -from modelforge.dataset import DataModule +import torch -from typing import Optional, Dict -from dataclasses import dataclass +from modelforge.dataset import DataModule # let us setup a few pytest options @@ -37,39 +38,6 @@ def create_datamodule(**kwargs): return create_datamodule -from modelforge.dataset.utils import ( - FirstComeFirstServeSplittingStrategy, - SplittingStrategy, -) - - -def initialize_datamodule( - dataset_name: str, - version_select: str = "nc_1000_v0", - batch_size: int = 64, - splitting_strategy: SplittingStrategy = FirstComeFirstServeSplittingStrategy(), - remove_self_energies: bool = True, - regression_ase: bool = False, - regenerate_dataset_statistic: bool = False, -) -> DataModule: - """ - Initialize a dataset for a given mode. - """ - - data_module = DataModule( - dataset_name, - splitting_strategy=splitting_strategy, - batch_size=batch_size, - version_select=version_select, - remove_self_energies=remove_self_energies, - regression_ase=regression_ase, - regenerate_dataset_statistic=regenerate_dataset_statistic, - ) - data_module.prepare_data() - data_module.setup() - return data_module - - # dataset fixture @pytest.fixture def dataset_factory(): @@ -79,20 +47,11 @@ def create_dataset(**kwargs): return create_dataset -from modelforge.dataset.dataset import DatasetFactory, TorchDataset -from modelforge.dataset import _ImplementedDatasets - - -def single_batch(batch_size: int = 64, dataset_name="QM9"): - """ - Utility function to create a single batch of data for testing. - """ - data_module = initialize_datamodule( - dataset_name=dataset_name, - batch_size=batch_size, - version_select="nc_1000_v0", - ) - return next(iter(data_module.train_dataloader(shuffle=False))) +from modelforge.dataset.dataset import ( + initialize_datamodule, + initialize_dataset, + single_batch, +) @pytest.fixture(scope="session") @@ -107,27 +66,6 @@ def _create_single_batch(batch_size: int, dataset_name: str): return _create_single_batch -def initialize_dataset( - dataset_name: str, - local_cache_dir: str, - versions_select: str = "nc_1000_v0", - force_download: bool = False, -) -> DataModule: - """ - Initialize a dataset for a given mode. 
- """ - - factory = DatasetFactory() - data = _ImplementedDatasets.get_dataset_class(dataset_name)( - local_cache_dir=local_cache_dir, - version_select=versions_select, - force_download=force_download, - ) - dataset = factory.create_dataset(data) - - return dataset - - @pytest.fixture(scope="session") def prep_temp_dir(tmp_path_factory): import uuid @@ -150,7 +88,6 @@ class DataSetContainer: from modelforge.dataset import _ImplementedDatasets - dataset_container: Dict[str, DataSetContainer] = { "QM9": DataSetContainer( name="QM9", @@ -268,7 +205,7 @@ def methane() -> BatchData: ------- BatchData """ - from modelforge.potential.utils import Metadata, NNPInput, BatchData + from modelforge.potential.utils import BatchData, Metadata, NNPInput atomic_numbers = torch.tensor([6, 1, 1, 1, 1], dtype=torch.int64) positions = ( @@ -302,9 +239,10 @@ def methane() -> BatchData: ) -import torch import math +import torch + def generate_uniform_quaternion(u=None): """ diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 4c9889e3..6c02b537 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -5,51 +5,7 @@ from modelforge.potential import NeuralNetworkPotentialFactory -def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): - from modelforge.tests.data import ( - potential_defaults, - training_defaults, - dataset_defaults, - runtime_defaults, - ) - from importlib import resources - import toml - - potential_path = ( - resources.files(potential_defaults) / f"{potential_name.lower()}.toml" - ) - dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" - training_path = resources.files(training_defaults) / "default.toml" - runtime_path = resources.files(runtime_defaults) / "runtime.toml" - - training_config_dict = toml.load(training_path) - dataset_config_dict = toml.load(dataset_path) - potential_config_dict = toml.load(potential_path) - runtime_config_dict = toml.load(runtime_path) - - potential_name = potential_config_dict["potential"]["potential_name"] - - from modelforge.potential import _Implemented_NNP_Parameters - - PotentialParameters = ( - _Implemented_NNP_Parameters.get_neural_network_parameter_class(potential_name) - ) - potential_parameters = PotentialParameters(**potential_config_dict["potential"]) - - from modelforge.dataset.dataset import DatasetParameters - from modelforge.train.parameters import TrainingParameters, RuntimeParameters - - dataset_parameters = DatasetParameters(**dataset_config_dict["dataset"]) - training_parameters = TrainingParameters(**training_config_dict["training"]) - runtime_parameters = RuntimeParameters(**runtime_config_dict["runtime"]) - - return { - "potential": potential_parameters, - "dataset": dataset_parameters, - "training": training_parameters, - "runtime": runtime_parameters, - } - +from modelforge.utils.misc import load_configs_into_pydantic_models @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() diff --git a/modelforge/utils/misc.py b/modelforge/utils/misc.py index f4e918f2..3a65e17d 100644 --- a/modelforge/utils/misc.py +++ b/modelforge/utils/misc.py @@ -372,3 +372,49 @@ def wrapper(*args, **kwargs): return wrapper return decorator + + +def load_configs_into_pydantic_models(potential_name: str, dataset_name: str): + from modelforge.tests.data import ( + potential_defaults, + training_defaults, + dataset_defaults, + runtime_defaults, + ) + from importlib import resources + import toml + + potential_path = 
( + resources.files(potential_defaults) / f"{potential_name.lower()}.toml" + ) + dataset_path = resources.files(dataset_defaults) / f"{dataset_name.lower()}.toml" + training_path = resources.files(training_defaults) / "default.toml" + runtime_path = resources.files(runtime_defaults) / "runtime.toml" + + training_config_dict = toml.load(training_path) + dataset_config_dict = toml.load(dataset_path) + potential_config_dict = toml.load(potential_path) + runtime_config_dict = toml.load(runtime_path) + + potential_name = potential_config_dict["potential"]["potential_name"] + + from modelforge.potential import _Implemented_NNP_Parameters + + PotentialParameters = ( + _Implemented_NNP_Parameters.get_neural_network_parameter_class(potential_name) + ) + potential_parameters = PotentialParameters(**potential_config_dict["potential"]) + + from modelforge.dataset.dataset import DatasetParameters + from modelforge.train.parameters import TrainingParameters, RuntimeParameters + + dataset_parameters = DatasetParameters(**dataset_config_dict["dataset"]) + training_parameters = TrainingParameters(**training_config_dict["training"]) + runtime_parameters = RuntimeParameters(**runtime_config_dict["runtime"]) + + return { + "potential": potential_parameters, + "dataset": dataset_parameters, + "training": training_parameters, + "runtime": runtime_parameters, + } From 9f78d4d9f7685e72aeb8a25dec0fe0f94146dd0d Mon Sep 17 00:00:00 2001 From: wiederm Date: Sat, 31 Aug 2024 10:48:16 +0200 Subject: [PATCH 56/66] formatting changes --- modelforge/tests/test_models.py | 1 + 1 file changed, 1 insertion(+) diff --git a/modelforge/tests/test_models.py b/modelforge/tests/test_models.py index 6c02b537..b6a4dc6f 100644 --- a/modelforge/tests/test_models.py +++ b/modelforge/tests/test_models.py @@ -7,6 +7,7 @@ from modelforge.utils.misc import load_configs_into_pydantic_models + @pytest.mark.parametrize( "potential_name", _Implemented_NNPs.get_all_neural_network_names() ) From 813728f9966bb19bfb378da0eaac751cd7a6add9 Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 09:33:47 -0700 Subject: [PATCH 57/66] Added classifiers, URLs, and bumped Python version Additionally added TODO tags and removed noisy comments --- pyproject.toml | 39 ++++++++++++++++----------------------- 1 file changed, 16 insertions(+), 23 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b6ca3463..a7da7488 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,9 +1,8 @@ [build-system] +# TODO: versioningit is now in v3 requires = ["setuptools>=61.0", "versioningit~=2.0"] build-backend = "setuptools.build_meta" -# Self-descriptive entries which should always be present -# https://packaging.python.org/en/latest/specifications/declaring-project-metadata/ [project] name = "modelforge" description = "Infrastructure to implement and train NNPs" @@ -17,17 +16,22 @@ license = { text = "MIT" } classifiers = [ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", + "Environment :: GPU", + "Environment :: GPU :: NVIDIA CUDA", + "Intended Audience :: Science/Research", + "Operating System :: POSIX :: Linux", + "Topic :: Scientific/Engineering", ] -requires-python = ">=3.8" +requires-python = ">=3.10" # Declare any run-time dependencies that should be installed with the package. #dependencies = [ # "importlib-resources;python_version<'3.10'", #] -# Update the urls once the hosting is set up. 
-#[project.urls] -#"Source" = "https://github.com//modelforge/" -#"Documentation" = "https://modelforge.readthedocs.io/" +[project.urls] +Source = "https://github.com/choderalab/modelforge" +Documentation = "https://modelforge.readthedocs.io/" +Wiki = "https://github.com/choderalab/modelforge/wiki" [project.optional-dependencies] test = [ @@ -36,8 +40,7 @@ test = [ ] [tool.setuptools] -# This subkey is a beta stage development and keys may change in the future, see https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html for more details -# +# TODO: is this next comment still relevant? # As of version 0.971, mypy does not support type checking of installed zipped # packages (because it does not actually import the Python packages). # We declare the package not-zip-safe so that our type hints are also available @@ -45,17 +48,8 @@ test = [ # Ref: # https://mypy.readthedocs.io/en/stable/installed_packages.html?highlight=zip#using-installed-packages-with-mypy-pep-561 zip-safe = false -# Let setuptools discover the package in the current directory, -# but be explicit about non-Python files. -# See also: -# https://setuptools.pypa.io/en/latest/userguide/pyproject_config.html#setuptools-specific-configuration -# Note that behavior is currently evolving with respect to how to interpret the -# "data" and "tests" subdirectories. As of setuptools 63, both are automatically -# included if namespaces is true (default), even if the package is named explicitly -# (instead of using 'find'). With 'find', the 'tests' subpackage is discovered -# recursively because of its __init__.py file, but the data subdirectory is excluded -# with include-package-data = false and namespaces = false. include-package-data = true + [tool.setuptools.packages.find] namespaces = false where = ["."] @@ -66,6 +60,7 @@ modelforge = [ "py.typed" ] +# https://versioningit.readthedocs.io/en/stable/configuration.html# [tool.versioningit] default-version = "1+unknown" @@ -75,9 +70,7 @@ dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" distance-dirty = "{base_version}+{distance}.{vcs}{rev}.dirty" [tool.versioningit.vcs] -# The method key: -method = "git" # <- The method name -# Parameters to pass to the method: +method = "git" match = ["*"] default-tag = "1.0.0" @@ -90,7 +83,7 @@ file = "modelforge/_version.py" omit = [ # Omit the tests "*/tests/*", - # Omit generated versioneer + # Omit generated versioningit "modelforge/_version.py" ] From 7909d8002950d36c13346f0b7cdb2fd99e0b2708 Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 11:37:00 -0700 Subject: [PATCH 58/66] Clarified why we use zip-safe in pyproject.toml --- pyproject.toml | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index a7da7488..597af3b5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -40,13 +40,10 @@ test = [ ] [tool.setuptools] -# TODO: is this next comment still relevant? -# As of version 0.971, mypy does not support type checking of installed zipped -# packages (because it does not actually import the Python packages). -# We declare the package not-zip-safe so that our type hints are also available -# when checking client code that uses our (installed) package. -# Ref: +# Disable zipping because mypy cannot read zip imports and this will affect downstream development. 
# https://mypy.readthedocs.io/en/stable/installed_packages.html?highlight=zip#using-installed-packages-with-mypy-pep-561 +# NOTE: We might consider removing this once we can test the code in a +# production environment since zipping the package may increase performance. zip-safe = false include-package-data = true From 45b14bdde65d112aeb0c616195ec372f009da4d0 Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 11:58:39 -0700 Subject: [PATCH 59/66] Bumped versioningit version --- pyproject.toml | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 597af3b5..6ca9f53d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,5 @@ [build-system] -# TODO: versioningit is now in v3 -requires = ["setuptools>=61.0", "versioningit~=2.0"] +requires = ["setuptools>=61.0", "versioningit~=3.0"] build-backend = "setuptools.build_meta" [project] @@ -16,17 +15,16 @@ license = { text = "MIT" } classifiers = [ "License :: OSI Approved :: MIT License", "Programming Language :: Python :: 3", - "Environment :: GPU", - "Environment :: GPU :: NVIDIA CUDA", "Intended Audience :: Science/Research", - "Operating System :: POSIX :: Linux", "Topic :: Scientific/Engineering", + "Operating System :: POSIX :: Linux", + "Environment :: GPU", + "Environment :: GPU :: NVIDIA CUDA", ] requires-python = ">=3.10" -# Declare any run-time dependencies that should be installed with the package. -#dependencies = [ -# "importlib-resources;python_version<'3.10'", -#] +# dependencies are specified in devtools/conda-envs/test_env.yaml, setuptools +# will not install package dependencies +dependencies = [] [project.urls] Source = "https://github.com/choderalab/modelforge" @@ -40,7 +38,7 @@ test = [ ] [tool.setuptools] -# Disable zipping because mypy cannot read zip imports and this will affect downstream development. +# Disable zipping because mypy cannot read zip imports and this may affect downstream development. # https://mypy.readthedocs.io/en/stable/installed_packages.html?highlight=zip#using-installed-packages-with-mypy-pep-561 # NOTE: We might consider removing this once we can test the code in a # production environment since zipping the package may increase performance. From 43bf627fa0d8aced4720609ff6283de24871576c Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 12:29:44 -0700 Subject: [PATCH 60/66] Removed pyproject.toml testing optional-dependencies --- pyproject.toml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 6ca9f53d..c9652872 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,21 +22,12 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA", ] requires-python = ">=3.10" -# dependencies are specified in devtools/conda-envs/test_env.yaml, setuptools -# will not install package dependencies -dependencies = [] [project.urls] Source = "https://github.com/choderalab/modelforge" Documentation = "https://modelforge.readthedocs.io/" Wiki = "https://github.com/choderalab/modelforge/wiki" -[project.optional-dependencies] -test = [ - "pytest>=6.1.2", - "pytest-runner" -] - [tool.setuptools] # Disable zipping because mypy cannot read zip imports and this may affect downstream development. 
# https://mypy.readthedocs.io/en/stable/installed_packages.html?highlight=zip#using-installed-packages-with-mypy-pep-561 From 1836a16105702076442d9588c86fd35636520219 Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 12:37:28 -0700 Subject: [PATCH 61/66] Added a non-testing conda environment file Co-authored-by: chrisiacovella --- devtools/conda-envs/env.yaml | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 devtools/conda-envs/env.yaml diff --git a/devtools/conda-envs/env.yaml b/devtools/conda-envs/env.yaml new file mode 100644 index 00000000..ee9ff70c --- /dev/null +++ b/devtools/conda-envs/env.yaml @@ -0,0 +1,27 @@ +name: test +channels: + - conda-forge + - pytorch +dependencies: + # Base depends + - python + - pip + - h5py + - tqdm + - toml + - qcportal>=0.50 + - qcelemental + - pytorch>=2.1 + - loguru + - lightning>=2.0.8 + - tensorboard + - torchvision + - openff-units + - torchmetrics>=1.4 + - pint=0.23 + - rdkit + - retry + - sqlitedict + - pydantic>=2 + - ray-all + - jax From a7be95cedcf772f2bf04c4b7f542c41dd131c4d9 Mon Sep 17 00:00:00 2001 From: Ian Kenney Date: Thu, 5 Sep 2024 12:42:48 -0700 Subject: [PATCH 62/66] Update env.yaml --- devtools/conda-envs/env.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/devtools/conda-envs/env.yaml b/devtools/conda-envs/env.yaml index ee9ff70c..8535f561 100644 --- a/devtools/conda-envs/env.yaml +++ b/devtools/conda-envs/env.yaml @@ -1,4 +1,4 @@ -name: test +name: modelforge_env channels: - conda-forge - pytorch From d11efe794d2d3abad4dc356cdcceb431bae99668 Mon Sep 17 00:00:00 2001 From: MarshallYan Date: Sat, 7 Sep 2024 00:07:27 -0400 Subject: [PATCH 63/66] change highest_atomic_number to maximum_atomic number for code consistency with other NNPs --- modelforge/potential/parameters.py | 2 +- modelforge/potential/tensornet.py | 24 ++++++++++++------------ 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/modelforge/potential/parameters.py b/modelforge/potential/parameters.py index dadc8117..a52fd6ab 100644 --- a/modelforge/potential/parameters.py +++ b/modelforge/potential/parameters.py @@ -199,7 +199,7 @@ class CoreParameter(ParametersBase): number_of_radial_basis_functions: int maximum_interaction_radius: Union[str, unit.Quantity] minimum_interaction_radius: Union[str, unit.Quantity] - highest_atomic_number: int + maximum_atomic_number: int equivariance_invariance_group: str activation_function_parameter: ActivationFunctionConfig diff --git a/modelforge/potential/tensornet.py b/modelforge/potential/tensornet.py index 3e9079ad..508954e5 100644 --- a/modelforge/potential/tensornet.py +++ b/modelforge/potential/tensornet.py @@ -192,8 +192,8 @@ class TensorNet(BaseNetwork): Maximum interaction radius. minimum_interaction_radius : unit.Quantity Minimum interaction radius. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. equivariance_invariance_group : str Equivariance invariance group, either "O(3)" or "SO(3)". 
activation_function_parameter : Dict @@ -214,7 +214,7 @@ def __init__( number_of_radial_basis_functions: int, maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, - highest_atomic_number: int, + maximum_atomic_number: int, equivariance_invariance_group: str, activation_function_parameter: Dict, postprocessing_parameter: Dict[str, Dict[str, bool]], @@ -239,7 +239,7 @@ def __init__( maximum_interaction_radius=_convert_str_to_unit(maximum_interaction_radius), minimum_interaction_radius=_convert_str_to_unit(minimum_interaction_radius), trainable_centers_and_scale_factors=False, - highest_atomic_number=highest_atomic_number, + maximum_atomic_number=maximum_atomic_number, equivariance_invariance_group=equivariance_invariance_group, activation_function=activation_function, ) @@ -263,8 +263,8 @@ class TensorNetCore(CoreNetwork): Minimum interaction radius. trainable_centers_and_scale_factors : bool If True, centers and scale factors are trainable. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. equivariance_invariance_group : str Equivariance invariance group, either "O(3)" or "SO(3)". activation_function : Type[torch.nn.Module] @@ -279,7 +279,7 @@ def __init__( maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, trainable_centers_and_scale_factors: bool, - highest_atomic_number: int, + maximum_atomic_number: int, equivariance_invariance_group: str, activation_function: Type[torch.nn.Module], seed: int = 0, @@ -294,7 +294,7 @@ def __init__( maximum_interaction_radius=maximum_interaction_radius, minimum_interaction_radius=minimum_interaction_radius, trainable_centers_and_scale_factors=trainable_centers_and_scale_factors, - highest_atomic_number=highest_atomic_number, + maximum_atomic_number=maximum_atomic_number, ) self.interaction_modules = nn.ModuleList( [ @@ -423,8 +423,8 @@ class TensorNetRepresentation(torch.nn.Module): Minimum interaction radius. trainable_centers_and_scale_factors : bool If True, centers and scale factors are trainable. - highest_atomic_number : int - Highest atomic number in the dataset. + maximum_atomic_number : int + Maximum atomic number in the dataset. 
""" def __init__( @@ -435,7 +435,7 @@ def __init__( maximum_interaction_radius: unit.Quantity, minimum_interaction_radius: unit.Quantity, trainable_centers_and_scale_factors: bool, - highest_atomic_number: int, + maximum_atomic_number: int, ): super().__init__() from modelforge.potential.utils import Dense @@ -467,7 +467,7 @@ def __init__( } ) self.atomic_number_i_embedding_layer = nn.Embedding( - highest_atomic_number, + maximum_atomic_number, number_of_per_atom_features, ) self.atomic_number_ij_embedding_layer = nn.Linear( From 65c348442cffb13323ac775cbc30a7f2cbb94449 Mon Sep 17 00:00:00 2001 From: MarshallYan Date: Sat, 7 Sep 2024 00:55:39 -0400 Subject: [PATCH 64/66] replace usage in config and tests --- .../data/potential_defaults/tensornet.toml | 2 +- modelforge/tests/test_tensornet.py | 6 ++-- scripts/config.toml | 30 +++++++++---------- 3 files changed, 19 insertions(+), 19 deletions(-) diff --git a/modelforge/tests/data/potential_defaults/tensornet.toml b/modelforge/tests/data/potential_defaults/tensornet.toml index 4805578b..e978d0c3 100644 --- a/modelforge/tests/data/potential_defaults/tensornet.toml +++ b/modelforge/tests/data/potential_defaults/tensornet.toml @@ -7,7 +7,7 @@ number_of_interaction_layers = 2 number_of_radial_basis_functions = 16 maximum_interaction_radius = "5.1 angstrom" minimum_interaction_radius = "0.0 angstrom" -highest_atomic_number = 128 +maximum_atomic_number = 128 equivariance_invariance_group = "O(3)" [potential.core_parameter.activation_function_parameter] diff --git a/modelforge/tests/test_tensornet.py b/modelforge/tests/test_tensornet.py index 92ed271c..305c8332 100644 --- a/modelforge/tests/test_tensornet.py +++ b/modelforge/tests/test_tensornet.py @@ -224,7 +224,7 @@ def test_representation(): cutoff_lower = 0.0 cutoff_upper = 5.1 trainable_rbf = False - highest_atomic_number = 128 + maximum_atomic_number = 128 # Set up a dataset # prepare reference value @@ -266,7 +266,7 @@ def test_representation(): cutoff_upper * unit.angstrom, cutoff_lower * unit.angstrom, trainable_rbf, - highest_atomic_number, + maximum_atomic_number, ) nnp_input = tensornet.core_module._model_specific_input_preparation( mf_input, pairlist_output @@ -286,7 +286,7 @@ def test_representation(): cutoff_lower, cutoff_upper, trainable_rbf, - highest_atomic_number, + maximum_atomic_number, seed, ) ################ torchmd-net TensorNet ################ diff --git a/scripts/config.toml b/scripts/config.toml index 45e8d64c..898cbf02 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -1,29 +1,29 @@ [potential] -potential_name = "ANI2x" +potential_name = "TensorNet" [potential.core_parameter] -angle_sections = 4 -maximum_interaction_radius = "5.1 angstrom" -minimum_interaction_radius = "0.8 angstrom" -number_of_radial_basis_functions = 16 -maximum_interaction_radius_for_angular_features = "3.5 angstrom" -minimum_interaction_radius_for_angular_features = "0.8 angstrom" -angular_dist_divisions = 8 +number_of_per_atom_features = 128 +number_of_interaction_layers = 2 +number_of_radial_basis_functions = 32 +maximum_interaction_radius = "10.0 angstrom" +minimum_interaction_radius = "0.0 angstrom" +maximum_atomic_number = 128 +equivariance_invariance_group = "O(3)" [potential.core_parameter.activation_function_parameter] -activation_function_name = "CeLU" # for the original ANI behavior please stick with CeLu since the alpha parameter is currently hard coded and might lead to different behavior when another activation function is used. 
- -[potential.core_parameter.activation_function_parameter.activation_function_arguments] -alpha = 0.1 +activation_function_name = "SiLU" [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] normalize = true from_atom_to_molecule_reduction = true keep_per_atom_property = true +[potential.postprocessing_parameter.general_postprocessing_operation] +calculate_molecular_self_energy = true + [dataset] -dataset_name = "PHALKETHOH" +dataset_name = "SPICE2" version_select = "nc_1000_v0" num_workers = 4 pin_memory = true @@ -31,7 +31,7 @@ pin_memory = true [training] number_of_epochs = 1000 remove_self_energies = true -batch_size = 16 +batch_size = 32 lr = 0.5e-3 monitor_for_checkpoint = "val/per_molecule_energy/rmse" @@ -75,7 +75,7 @@ seed = 42 save_dir = "test_setup" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" -accelerator = "cpu" +accelerator = "gpu" number_of_nodes = 1 devices = 1 #[0,1,2,3] checkpoint_path = "None" From aa9068fe80b017ae641d9329ec099d89ff364dac Mon Sep 17 00:00:00 2001 From: MarshallYan Date: Sun, 8 Sep 2024 13:30:08 -0400 Subject: [PATCH 65/66] revert config.toml --- scripts/config.toml | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/scripts/config.toml b/scripts/config.toml index 898cbf02..e7fc2563 100644 --- a/scripts/config.toml +++ b/scripts/config.toml @@ -1,29 +1,29 @@ [potential] -potential_name = "TensorNet" +potential_name = "ANI2x" [potential.core_parameter] -number_of_per_atom_features = 128 -number_of_interaction_layers = 2 -number_of_radial_basis_functions = 32 -maximum_interaction_radius = "10.0 angstrom" -minimum_interaction_radius = "0.0 angstrom" -maximum_atomic_number = 128 -equivariance_invariance_group = "O(3)" +angle_sections = 4 +maximum_interaction_radius = "5.1 angstrom" +minimum_interaction_radius = "0.8 angstrom" +number_of_radial_basis_functions = 16 +maximum_interaction_radius_for_angular_features = "3.5 angstrom" +minimum_interaction_radius_for_angular_features = "0.8 angstrom" +angular_dist_divisions = 8 [potential.core_parameter.activation_function_parameter] -activation_function_name = "SiLU" +activation_function_name = "CeLU" # for the original ANI behavior please stick with CeLu since the alpha parameter is currently hard coded and might lead to different behavior when another activation function is used. 
+ +[potential.core_parameter.activation_function_parameter.activation_function_arguments] +alpha = 0.1 [potential.postprocessing_parameter] [potential.postprocessing_parameter.per_atom_energy] normalize = true from_atom_to_molecule_reduction = true keep_per_atom_property = true -[potential.postprocessing_parameter.general_postprocessing_operation] -calculate_molecular_self_energy = true - [dataset] -dataset_name = "SPICE2" +dataset_name = "PHALKETHOH" version_select = "nc_1000_v0" num_workers = 4 pin_memory = true @@ -31,7 +31,7 @@ pin_memory = true [training] number_of_epochs = 1000 remove_self_energies = true -batch_size = 32 +batch_size = 16 lr = 0.5e-3 monitor_for_checkpoint = "val/per_molecule_energy/rmse" @@ -75,9 +75,9 @@ seed = 42 save_dir = "test_setup" experiment_name = "{potential_name}_{dataset_name}" local_cache_dir = "./cache" -accelerator = "gpu" +accelerator = "cpu" number_of_nodes = 1 devices = 1 #[0,1,2,3] checkpoint_path = "None" simulation_environment = "PyTorch" -log_every_n_steps = 1 +log_every_n_steps = 1 \ No newline at end of file From d0647fe4906df9b5133fc3779284a76542c6c0d6 Mon Sep 17 00:00:00 2001 From: wiederm Date: Fri, 13 Sep 2024 19:52:31 +0200 Subject: [PATCH 66/66] linting changes --- modelforge/potential/sake.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modelforge/potential/sake.py b/modelforge/potential/sake.py index 39287218..6e0ff14e 100644 --- a/modelforge/potential/sake.py +++ b/modelforge/potential/sake.py @@ -224,7 +224,6 @@ def compute_properties(self, data: SAKENeuralNetworkInput): return results - class SAKEInteraction(nn.Module): """ Spatial Attention Kinetic Networks Layer.
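
Taken together, the dataset-side changes in this series are exercised through the
initialize_datamodule helper moved into modelforge/dataset/dataset.py above. A short usage
sketch (the dataset-name spelling and the accepted version keys follow PhAlkEthOH.yaml; treat
the exact strings as assumptions):

    from modelforge.dataset.dataset import initialize_datamodule

    # Build a DataModule for the small PhAlkEthOH test split added in this series
    dm = initialize_datamodule(
        dataset_name="PhAlkEthOH",
        version_select="nc_1000_v1",  # or "full_dataset_min_v1" for the energy-minimized set
        batch_size=64,
    )
    batch = next(iter(dm.train_dataloader(shuffle=False)))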