From 225bad41ea7ddf6a6c274e1af2599b3523d660de Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Wed, 28 Aug 2024 20:33:32 -0700
Subject: [PATCH 1/6] enable removing of high-force configurations from
 phalkethoh dataset

---
 modelforge/curation/phalkethoh_curation.py    | 201 ++++++++++--------
 .../curation/scripts/curate_PhAlkEthOH.py     |   9 +-
 modelforge/dataset/dataset.py                 |   2 +
 3 files changed, 122 insertions(+), 90 deletions(-)

diff --git a/modelforge/curation/phalkethoh_curation.py b/modelforge/curation/phalkethoh_curation.py
index 600ef8ca..21c5e346 100644
--- a/modelforge/curation/phalkethoh_curation.py
+++ b/modelforge/curation/phalkethoh_curation.py
@@ -267,7 +267,7 @@ def _calculate_total_charge(
         rdmol = Chem.MolFromSmiles(smiles, sanitize=False)
         total_charge = sum(atom.GetFormalCharge() for atom in rdmol.GetAtoms())
 
-        return (int(total_charge) * unit.elementary_charge,)
+        return int(total_charge) * unit.elementary_charge
 
     def _process_downloaded(
         self,
@@ -277,6 +277,7 @@ def _process_downloaded(
         max_conformers_per_record: Optional[int] = None,
         total_conformers: Optional[int] = None,
         atomic_numbers_to_limit: Optional[List[int]] = None,
+        max_force: Optional[unit.Quantity] = None,
     ):
         """
         Processes a downloaded dataset: extracts relevant information.
@@ -295,6 +296,8 @@ def _process_downloaded(
             If set, this will limit the total number of conformers to the specified number.
         atomic_numbers_to_limit: Optional[List[int]], optional, default=None
             If set, this will limit the dataset to only include molecules with atomic numbers in the list.
+        max_force: Optional[float], optional, default=None
+            If set, this will exclude any conformers with a force that exceeds this value.
         """
         from tqdm import tqdm
         import numpy as np
@@ -358,7 +361,7 @@ def _process_downloaded(
                         ]
                         data_temp["n_configs"] = 0
 
-                        (data_temp["total_charge"],) = self._calculate_total_charge(
+                        data_temp["total_charge"] = self._calculate_total_charge(
                             data_temp[
                                 "canonical_isomeric_explicit_hydrogen_mapped_smiles"
                             ]
@@ -378,104 +381,120 @@ def _process_downloaded(
                     index = self.molecule_names[name]
 
                     for state in trajectory:
+                        add_record = True
                         properties, config = state
-                        self.data[index]["n_configs"] += 1
-
-                        # note, we will use the convention of names being lowercase
-                        # and spaces denoted by underscore
-                        quantity = "geometry"
-                        quantity_o = "geometry"
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = config.reshape(1, -1, 3)
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    config.reshape(1, -1, 3),
+
+                        # if set, let us see if the configuration has a force that exceeds the maximum
+                        if max_force is not None:
+                            force_magnitude = (
+                                np.abs(
+                                    properties["properties"]["current gradient"]
+                                    + properties["properties"][
+                                        "dispersion correction gradient"
+                                    ]
                                 )
+                                * self.qm_parameters["dft_total_force"]["u_in"]
                             )
+                            if np.any(force_magnitude > max_force):
+                                add_record = False
+                        if add_record:
+                            self.data[index]["n_configs"] += 1
+
+                            # note, we will use the convention of names being lowercase
+                            # and spaces denoted by underscore
+                            quantity = "geometry"
+                            quantity_o = "geometry"
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = config.reshape(1, -1, 3)
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        config.reshape(1, -1, 3),
+                                    )
+                                )
 
-                        # note, we will use the convention of names being lowercase
-                        # and spaces denoted by underscore
-                        quantity = "current energy"
-                        quantity_o = "dft_total_energy"
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = properties["properties"][
-                                quantity
-                            ]
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    properties["properties"][quantity],
+                            # note, we will use the convention of names being lowercase
+                            # and spaces denoted by underscore
+                            quantity = "current energy"
+                            quantity_o = "dft_total_energy"
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = properties["properties"][
+                                    quantity
+                                ]
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        properties["properties"][quantity],
+                                    )
                                 )
-                            )
 
-                        quantity = "dispersion correction energy"
-                        quantity_o = "dispersion_correction_energy"
-                        # Note need to typecast here because of a bug in the
-                        # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = np.array(
-                                float(properties["properties"][quantity])
-                            ).reshape(1, 1)
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    np.array(
-                                        float(properties["properties"][quantity])
-                                    ).reshape(1, 1),
-                                ),
-                            )
+                            quantity = "dispersion correction energy"
+                            quantity_o = "dispersion_correction_energy"
+                            # Note need to typecast here because of a bug in the
+                            # qcarchive entry: see issue: https://github.com/MolSSI/QCFractal/issues/766
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = np.array(
+                                    float(properties["properties"][quantity])
+                                ).reshape(1, 1)
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        np.array(
+                                            float(properties["properties"][quantity])
+                                        ).reshape(1, 1),
+                                    ),
+                                )
 
-                        quantity = "current gradient"
-                        quantity_o = "dft_total_gradient"
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = np.array(
-                                properties["properties"][quantity]
-                            ).reshape(1, -1, 3)
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    np.array(
-                                        properties["properties"][quantity]
-                                    ).reshape(1, -1, 3),
+                            quantity = "current gradient"
+                            quantity_o = "dft_total_gradient"
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = np.array(
+                                    properties["properties"][quantity]
+                                ).reshape(1, -1, 3)
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        np.array(
+                                            properties["properties"][quantity]
+                                        ).reshape(1, -1, 3),
+                                    )
                                 )
-                            )
 
-                        quantity = "dispersion correction gradient"
-                        quantity_o = "dispersion_correction_gradient"
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = np.array(
-                                properties["properties"][quantity]
-                            ).reshape(1, -1, 3)
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    np.array(
-                                        properties["properties"][quantity]
-                                    ).reshape(1, -1, 3),
+                            quantity = "dispersion correction gradient"
+                            quantity_o = "dispersion_correction_gradient"
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = np.array(
+                                    properties["properties"][quantity]
+                                ).reshape(1, -1, 3)
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        np.array(
+                                            properties["properties"][quantity]
+                                        ).reshape(1, -1, 3),
+                                    )
                                 )
-                            )
 
-                        quantity = "scf dipole"
-                        quantity_o = "scf_dipole"
-                        if quantity_o not in self.data[index].keys():
-                            self.data[index][quantity_o] = np.array(
-                                properties["properties"][quantity]
-                            ).reshape(1, 3)
-                        else:
-                            self.data[index][quantity_o] = np.vstack(
-                                (
-                                    self.data[index][quantity_o],
-                                    np.array(
-                                        properties["properties"][quantity]
-                                    ).reshape(1, 3),
+                            quantity = "scf dipole"
+                            quantity_o = "scf_dipole"
+                            if quantity_o not in self.data[index].keys():
+                                self.data[index][quantity_o] = np.array(
+                                    properties["properties"][quantity]
+                                ).reshape(1, 3)
+                            else:
+                                self.data[index][quantity_o] = np.vstack(
+                                    (
+                                        self.data[index][quantity_o],
+                                        np.array(
+                                            properties["properties"][quantity]
+                                        ).reshape(1, 3),
+                                    )
                                 )
-                            )
 
         # assign units
         for datapoint in self.data:
@@ -564,6 +583,7 @@ def process(
         max_conformers_per_record: Optional[int] = None,
         total_conformers: Optional[int] = None,
         limit_atomic_species: Optional[list] = None,
+        max_force: Optional[unit.Quantity] = None,
         n_threads=2,
     ) -> None:
         """
@@ -586,7 +606,9 @@ def process(
             Note defining this will only fetch from the "SPICE PubChem Set 1 Single Points Dataset v1.2"
         limit_atomic_species: Optional[list] = None,
             If set to a list of element symbols, records that contain any elements not in this list will be ignored.
-        n_threads, int, default=6
+        max_force: Optional[float], optional, default=None
+            If set this any confirugrations with a force that exceeds this value will be excluded.
+        n_threads, int, default=2
             Number of concurrent threads for retrieving data from QCArchive
         Examples
         --------
@@ -664,6 +686,7 @@ def process(
             max_conformers_per_record=max_conformers_per_record,
             total_conformers=total_conformers,
             atomic_numbers_to_limit=self.atomic_numbers_to_limit,
+            max_force=max_force,
         )
 
         self._generate_hdf5()
diff --git a/modelforge/curation/scripts/curate_PhAlkEthOH.py b/modelforge/curation/scripts/curate_PhAlkEthOH.py
index 6dfab740..523464d2 100644
--- a/modelforge/curation/scripts/curate_PhAlkEthOH.py
+++ b/modelforge/curation/scripts/curate_PhAlkEthOH.py
@@ -20,6 +20,7 @@ def PhAlkEthOH_openff_wrapper(
     max_conformers_per_record=None,
     total_conformers=None,
     limit_atomic_species=None,
+    max_force=None,
 ):
     """
     This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file.
@@ -49,7 +50,8 @@ def PhAlkEthOH_openff_wrapper(
     limit_atomic_species: list, optional, default=None
         A list of atomic species to limit the dataset to. Any molecules that contain elements outside of this list
         will be ignored. If not defined, no filtering by atomic species will be performed.
-
+    max_force: float, optional, default=None
+        The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored.
 
     """
     from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration
@@ -67,6 +69,7 @@ def PhAlkEthOH_openff_wrapper(
         total_conformers=total_conformers,
         limit_atomic_species=limit_atomic_species,
         n_threads=1,
+        max_force=max_force,
     )
     print(f"Total records: {PhAlkEthOH_dataset.total_records}")
     print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}")
@@ -74,6 +77,8 @@ def PhAlkEthOH_openff_wrapper(
 
 def main():
 
+    from openff.units import unit
+
     # define the location where to store and output the files
     import os
 
@@ -99,6 +104,7 @@ def main():
         max_records=1000,
         total_conformers=1000,
         max_conformers_per_record=10,
+        max_force=1.0 * unit.hartree / unit.bohr,
     )
 
     # curate the full dataset
@@ -110,6 +116,7 @@ def main():
         local_cache_dir,
         force_download=False,
         version_select=version_select,
+        max_force=1.0 * unit.hartree / unit.bohr,
     )
 
 
diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py
index 53142d86..07a80f69 100644
--- a/modelforge/dataset/dataset.py
+++ b/modelforge/dataset/dataset.py
@@ -488,6 +488,8 @@ def __init__(
             Directory to store the files.
         force_download : bool, optional
             If set to True, the data will be downloaded even if it already exists. Default is False.
+        regenerate_cache : bool, optional
+            If set to True, the cache file will be regenerated even if it already exists. Default is False.
         """
         self.url = url
         self.gz_data_file = gz_data_file

From 904d3901b354d55c6a2a86455631be39cbee6dfe Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Thu, 29 Aug 2024 09:13:01 -0700
Subject: [PATCH 2/6] added v1 phalkethoh

---
 modelforge/curation/phalkethoh_curation.py    | 10 +++++
 .../curation/scripts/curate_PhAlkEthOH.py     | 37 ++++++++++++++++++-
 modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 15 ++++++++
 3 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/modelforge/curation/phalkethoh_curation.py b/modelforge/curation/phalkethoh_curation.py
index 21c5e346..52d1ec2f 100644
--- a/modelforge/curation/phalkethoh_curation.py
+++ b/modelforge/curation/phalkethoh_curation.py
@@ -278,6 +278,7 @@ def _process_downloaded(
         total_conformers: Optional[int] = None,
         atomic_numbers_to_limit: Optional[List[int]] = None,
         max_force: Optional[unit.Quantity] = None,
+        final_conformer_only: Optional[bool] = None,
     ):
         """
         Processes a downloaded dataset: extracts relevant information.
@@ -298,6 +299,9 @@ def _process_downloaded(
             If set, this will limit the dataset to only include molecules with atomic numbers in the list.
         max_force: Optional[float], optional, default=None
             If set, this will exclude any conformers with a force that exceeds this value.
+        final_conformer_only: Optional[bool], optional, default=None
+            If set to True, only the final conformer of each record will be processed. This should be the final
+            energy minimized conformer.
         """
         from tqdm import tqdm
         import numpy as np
@@ -380,6 +384,8 @@ def _process_downloaded(
                     name = key
                     index = self.molecule_names[name]
 
+                    if final_conformer_only:
+                        trajectory = [trajectory[-1]]
                     for state in trajectory:
                         add_record = True
                         properties, config = state
@@ -584,6 +590,7 @@ def process(
         total_conformers: Optional[int] = None,
         limit_atomic_species: Optional[list] = None,
         max_force: Optional[unit.Quantity] = None,
+        final_conformer_only=None,
         n_threads=2,
     ) -> None:
         """
@@ -608,6 +615,8 @@ def process(
             If set to a list of element symbols, records that contain any elements not in this list will be ignored.
         max_force: Optional[float], optional, default=None
             If set this any confirugrations with a force that exceeds this value will be excluded.
+        final_conformer_only: Optional[bool], optional, default=None
+            If set to True, only the final conformer of each record will be processed.
         n_threads, int, default=2
             Number of concurrent threads for retrieving data from QCArchive
         Examples
@@ -687,6 +696,7 @@ def process(
             total_conformers=total_conformers,
             atomic_numbers_to_limit=self.atomic_numbers_to_limit,
             max_force=max_force,
+            final_conformer_only=final_conformer_only,
         )
 
         self._generate_hdf5()
diff --git a/modelforge/curation/scripts/curate_PhAlkEthOH.py b/modelforge/curation/scripts/curate_PhAlkEthOH.py
index 523464d2..cecff1ff 100644
--- a/modelforge/curation/scripts/curate_PhAlkEthOH.py
+++ b/modelforge/curation/scripts/curate_PhAlkEthOH.py
@@ -21,6 +21,7 @@ def PhAlkEthOH_openff_wrapper(
     total_conformers=None,
     limit_atomic_species=None,
     max_force=None,
+    final_conformer_only=False,
 ):
     """
     This curates and processes the SPICE 114 dataset at the OpenFF level of theory into an hdf5 file.
@@ -52,6 +53,8 @@ def PhAlkEthOH_openff_wrapper(
         will be ignored. If not defined, no filtering by atomic species will be performed.
     max_force: float, optional, default=None
         The maximum force to allow in the dataset. Any conformers with forces greater than this value will be ignored.
+    final_conformer_only: bool, optional, default=False
+        If True, only the final conformer for each molecule will be processed. If False, all conformers will be processed.
 
     """
     from modelforge.curation.phalkethoh_curation import PhAlkEthOHCuration
@@ -70,6 +73,7 @@ def PhAlkEthOH_openff_wrapper(
         limit_atomic_species=limit_atomic_species,
         n_threads=1,
         max_force=max_force,
+        final_conformer_only=final_conformer_only,
     )
     print(f"Total records: {PhAlkEthOH_dataset.total_records}")
     print(f"Total conformers: {PhAlkEthOH_dataset.total_conformers}")
@@ -88,9 +92,9 @@ def main():
 
     # We'll want to provide some simple means of versioning
     # if we make updates to either the underlying dataset, curation modules, or parameters given to the code
-    version = "0"
+    version = "1"
     # version of the dataset to curate
-    version_select = f"v_{version}"
+    version_select = f"v_0"
 
     # curate dataset with 1000 total conformers, max of 10 conformers per record
     hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000.hdf5"
@@ -119,6 +123,35 @@ def main():
         max_force=1.0 * unit.hartree / unit.bohr,
     )
 
+    # curate the minimal dataset with 1000 total conformers, keeping only the final conformer of each trajectory
+    hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_ntc_1000_minimal.hdf5"
+
+    PhAlkEthOH_openff_wrapper(
+        hdf5_file_name,
+        output_file_dir,
+        local_cache_dir,
+        force_download=False,
+        version_select=version_select,
+        max_records=1000,
+        total_conformers=1000,
+        max_conformers_per_record=10,
+        max_force=1.0 * unit.hartree / unit.bohr,
+        final_conformer_only=True,
+    )
+
+    # curate the full minimal dataset, keeping only the final conformer of each trajectory
+    hdf5_file_name = f"PhAlkEthOH_openff_dataset_v{version}_minimal.hdf5"
+    print("total dataset")
+    PhAlkEthOH_openff_wrapper(
+        hdf5_file_name,
+        output_file_dir,
+        local_cache_dir,
+        force_download=False,
+        version_select=version_select,
+        max_force=1.0 * unit.hartree / unit.bohr,
+        final_conformer_only=True,
+    )
+
 
 if __name__ == "__main__":
     main()
diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
index 06ca005e..d4e96096 100644
--- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
+++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
@@ -1,6 +1,21 @@
 dataset: PhAlkEthOH
 latest: full_dataset_v0
 latest_test: nc_1000_v0
+full_dataset_v1:
+  version: 1
+  doi: 10.5281/zenodo.13450735
+  notes: removes high force conformers
+  gz_data_file:
+    length: 3300668359
+    md5: b051af374f3233e2925f7a1b96707772
+    name: PhAlkEthOH_dataset_v0.hdf5.gz
+  hdf5_data_file:
+    md5: f5d9dccb8e79a51892b671108bc57bde
+    name: PhAlkEthOH_dataset_v1.hdf5
+  processed_data_file:
+    md5: null
+    name: PhAlkEthOH_dataset_v1_processed.npz
+  url: https://zenodo.org/records/13450735/files/PhAlkEthOH_openff_dataset_v1.hdf5.gz
 full_dataset_v0:
   version: 0
   doi: 10.5281/zenodo.12174233

From 77a847772b98fc287aa19af25c722de4d1876558 Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Thu, 29 Aug 2024 18:39:11 -0700
Subject: [PATCH 3/6] additional dataset versions added.

---
 modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 33 +++++++++++++++++--
 1 file changed, 31 insertions(+), 2 deletions(-)

diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
index d4e96096..2f59df24 100644
--- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
+++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
@@ -1,6 +1,6 @@
 dataset: PhAlkEthOH
 latest: full_dataset_v0
-latest_test: nc_1000_v0
+latest_test: nc_1000_v1
 full_dataset_v1:
   version: 1
   doi: 10.5281/zenodo.13450735
@@ -8,7 +8,7 @@ full_dataset_v1:
   gz_data_file:
     length: 3300668359
     md5: b051af374f3233e2925f7a1b96707772
-    name: PhAlkEthOH_dataset_v0.hdf5.gz
+    name: PhAlkEthOH_dataset_v1.hdf5.gz
   hdf5_data_file:
     md5: f5d9dccb8e79a51892b671108bc57bde
     name: PhAlkEthOH_dataset_v1.hdf5
@@ -16,6 +16,35 @@ full_dataset_v1:
     md5: null
     name: PhAlkEthOH_dataset_v1_processed.npz
   url: https://zenodo.org/records/13450735/files/PhAlkEthOH_openff_dataset_v1.hdf5.gz
+nc_1000_v1:
+  version: 1
+  doi: 10.5281/zenodo.13560343
+  gz_data_file:
+    length: 2702091
+    md5: 76b421802bef68f858757dba41f3ea2e
+    name: PhAlkEthOH_dataset_v1_nc_1000.hdf5.gz
+  hdf5_data_file:
+    md5: 244eb8d1b3547b8da229fd1507fb4d4e
+    name: PhAlkEthOH_dataset_v1_nc_1000.hdf5
+  processed_data_file:
+    md5: null
+    name: PhAlkEthOH_dataset_v1_nc_1000_processed.npz
+  url: https://zenodo.org/records/13560343/files/PhAlkEthOH_openff_dataset_v1_ntc_1000.hdf5.gz
+full_dataset_min_v1:
+  version: 1
+  doi: 10.5281/zenodo.13561100
+  notes: removes high force configurations, only contains final optimized configuration
+  gz_data_file:
+    length: 31352642
+    md5: 205b0b7bc1858b1d3745480d9a29a770
+    name: PhAlkEthOH_dataset_v1_min.hdf5.gz
+  hdf5_data_file:
+    md5: 41cb40718f8872baa6c468ab08574d46
+    name: PhAlkEthOH_dataset_v1_min.hdf5
+  processed_data_file:
+    md5: null
+    name: PhAlkEthOH_dataset_v1_min_processed.npz
+  url: https://zenodo.org/records/13561100/files/PhAlkEthOH_openff_dataset_v1_min.hdf5.gz
 full_dataset_v0:
   version: 0
   doi: 10.5281/zenodo.12174233

From 05e29f2aabddf7bbf4c46c5e299e6c7806791067 Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Thu, 29 Aug 2024 22:46:06 -0700
Subject: [PATCH 4/6] additional dataset versions added.

---
 modelforge/dataset/yaml_files/PhAlkEthOH.yaml | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
index 2f59df24..568e6c6b 100644
--- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
+++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
@@ -19,6 +19,7 @@ full_dataset_v1:
 nc_1000_v1:
   version: 1
   doi: 10.5281/zenodo.13560343
+  notes: removes high force conformers, 1000 conformers, max 10 per molecule
   gz_data_file:
     length: 2702091
     md5: 76b421802bef68f858757dba41f3ea2e
@@ -45,6 +46,21 @@ full_dataset_min_v1:
     md5: null
     name: PhAlkEthOH_dataset_v1_min_processed.npz
   url: https://zenodo.org/records/13561100/files/PhAlkEthOH_openff_dataset_v1_min.hdf5.gz
+nc_1000_min_v1:
+  version: 1
+  doi: 10.5281/zenodo.13576458
+  notes: removes high force conformers, 1000 conformers, only contains final optimized configuration
+  gz_data_file:
+    length: 3476870
+    md5: 7261f4738efd4bf8409268961837ba78
+    name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5.gz
+  hdf5_data_file:
+    md5: 5d347a78c6c3b45531870a05d5aab77e
+    name: PhAlkEthOH_dataset_v1_nc_1000_min.hdf5
+  processed_data_file:
+    md5: null
+    name: PhAlkEthOH_dataset_v1_nc_1000_min_processed.npz
+  url: https://zenodo.org/records/13576458/files/PhAlkEthOH_openff_dataset_v1_ntc_1000_min.hdf5.gz
 full_dataset_v0:
   version: 0
   doi: 10.5281/zenodo.12174233

From ce41a4977818901a4b96d6619997ebb9391c1e0a Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Thu, 29 Aug 2024 23:50:37 -0700
Subject: [PATCH 5/6] updated "latest"

---
 modelforge/dataset/yaml_files/PhAlkEthOH.yaml          | 2 +-
 modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
index 568e6c6b..7d8d186a 100644
--- a/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
+++ b/modelforge/dataset/yaml_files/PhAlkEthOH.yaml
@@ -1,5 +1,5 @@
 dataset: PhAlkEthOH
-latest: full_dataset_v0
+latest: full_dataset_v1
 latest_test: nc_1000_v1
 full_dataset_v1:
   version: 1
diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
index 60281c0c..436ad852 100644
--- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml
+++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
@@ -1,5 +1,5 @@
 [dataset]
 dataset_name = "PHALKETHOH"
-version_select = "nc_1000_v0"
+version_select = "full_dataset_min_v1"
 num_workers = 4
 pin_memory = true
\ No newline at end of file

From 031205caa47ad0680e1fcbab6cd8c7b600ed0727 Mon Sep 17 00:00:00 2001
From: chrisiacovella <chris.iacovella@gmail.com>
Date: Fri, 30 Aug 2024 10:04:24 -0700
Subject: [PATCH 6/6] updated "latest"

---
 modelforge/tests/data/dataset_defaults/phalkethoh.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/modelforge/tests/data/dataset_defaults/phalkethoh.toml b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
index 436ad852..ddac0202 100644
--- a/modelforge/tests/data/dataset_defaults/phalkethoh.toml
+++ b/modelforge/tests/data/dataset_defaults/phalkethoh.toml
@@ -1,5 +1,5 @@
 [dataset]
 dataset_name = "PHALKETHOH"
-version_select = "full_dataset_min_v1"
+version_select = "nc_1000_v1"
 num_workers = 4
 pin_memory = true
\ No newline at end of file