From b69d87450d46b338c86a22c97394625f9c4917ab Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Sat, 26 Aug 2023 00:38:25 -0700 Subject: [PATCH 01/11] Initial framework for dataset curation and transformation to hdf5 files with units attached --- modelforge/curation/__init__.py | 1 + modelforge/curation/qm9_curation.py | 314 ++++++++++++++++++++++++++++ modelforge/curation/utils.py | 62 ++++++ modelforge/dataset/dataset.py | 18 +- 4 files changed, 388 insertions(+), 7 deletions(-) create mode 100644 modelforge/curation/__init__.py create mode 100644 modelforge/curation/qm9_curation.py create mode 100644 modelforge/curation/utils.py diff --git a/modelforge/curation/__init__.py b/modelforge/curation/__init__.py new file mode 100644 index 00000000..0f89000c --- /dev/null +++ b/modelforge/curation/__init__.py @@ -0,0 +1 @@ +"""modelforge dataset curation.""" diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py new file mode 100644 index 00000000..e2bc158f --- /dev/null +++ b/modelforge/curation/qm9_curation.py @@ -0,0 +1,314 @@ +import requests +from loguru import logger +import os +from tqdm import tqdm + +from openff.units import unit, Quantity +import pint +import qcelemental as qcel + +import tarfile +from modelforge.curation.utils import dict_to_hdf5 +import numpy as np + + +class QM9_curation: + """ + Routines to fetch and process QM9 dataset. + + Parameters + ---------- + hdf5_file_name: str, required + Name of the hdf5 file that will be generated + local_cache_dir: str, required, default=qm9_datafiles + Location to save downloaded dataset. + + """ + + def __init__(self, hdf5_file_name: str, local_cache_dir: str = "qm9_datafiles"): + self.local_cache_dir = local_cache_dir + self.hdf5_file_name = hdf5_file_name + + # define key pieces of information related to the dataset + self.dataset_description = { + "publication_doi": "10.1038/sdata.2014.22", + "collection_doi": "10.6084/m9.figshare.c.978904.v5", + "dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2", + "dataset_download_url": "https://ndownloader.figshare.com/files/3195389", + "dataset_filename": "dsgdb9nsd.xyz.tar.bz2", + "publication_citation": """Ramakrishnan, R., Dral, P., Rupp, M. et al. + Quantum chemistry structures and properties of 134 kilo molecules. + Sci Data 1, 140022 (2014). + https://doi.org/10.1038/sdata.2014.22""", + "dataset_citation": """Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014). + Quantum chemistry structures and properties of 134 kilo molecules. + figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5""", + "description": """QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF). + All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.""", + } + + def _download( + self, url: str, name: str, output_path: str, force_download=False + ) -> None: + """ + Downloads the dataset tar file from figshare. + + Parameters + ---------- + url: str, required + Figshare url to the data downloader + name: str, required + Name of the file downloaded + output_path: str, required + Location to download the file to. + force_download: str, default=False + If False, the file is not downloaded if it already exists in the directory. + If True, the file will be downloaded even if it exists. 
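+
+        Examples
+        --------
+        >>> # illustrative call only; url and name mirror the values kept in
+        >>> # self.dataset_description above
+        >>> self._download(
+        ...     url="https://ndownloader.figshare.com/files/3195389",
+        ...     name="dsgdb9nsd.xyz.tar.bz2",
+        ...     output_path="qm9_datafiles",
+        ... )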
+ + """ + + if not os.path.isfile(f"{output_path}/{name}") or force_download: + logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.") + chunk_size = 512 + # get the head of the request + head = requests.head(url) + + # because the url is calling a downloader, instead of the direct file + # we can extract the file location and then fetch the length from this head + # this is only useful for the download bar status + temp_url = head.headers["location"].split("?")[0] + length = int(requests.head(temp_url).headers["Content-Length"]) + + r = requests.get(url, stream=True) + + if not os.path.exists(output_path): + os.makedirs(output_path) + + with open(f"{output_path}/{name}", "wb") as fd: + for chunk in tqdm( + r.iter_content(chunk_size=chunk_size), + ascii=True, + desc="downloading", + total=(int(length / chunk_size) + 1), + ): + fd.write(chunk) + else: + logger.debug("Datafile exists, using cached file.") + + def _extract(self, file_path: str, cache_directory: str) -> None: + """ + Extract the contents of a tar.bz2 file. + + Parameters + ---------- + file_path: str, required + tar.bz2 to extract. + cache_directory: str, required + Location to save the contents from the tar.bz2 file + """ + logger.debug(f"Extracting tar {file_path}.") + + tar = tarfile.open(f"{file_path}", "r:bz2") + tar.extractall(cache_directory) + tar.close() + + def _str_to_float(self, x: str) -> float: + """ + Converts a string to float, fixing Mathematica style scientific notion. + + For example converts str(1*^-6) to float(1e-6). + + Parameters + ---------- + x : str, required + String to process. + + Returns + ------- + float + Float value of the string. + """ + xf = float(x.replace("*^", "e")) + return xf + + def _parse_properties(self, line: str) -> dict: + """ + Parses the property line in the xyz file. + + Properties + ---------- + line: str, required + String to parse following the description in the original manuscript (See tables 2 and 3) + + Returns + ------- + dict + Dictionary of properties, with units added when appropriate. + """ + + temp_prop = line.split() + # list of properties and their units + labels_prop_units = [ + ("tag", None), + ("idx", None), + ("rotational constant A", unit.gigahertz), + ("rotational constant B", unit.gigahertz), + ("rotational constant C", unit.gigahertz), + ("dipole moment", unit.debye), + ("isotropic polarizability", unit.angstrom**3), + ("energy of homo", unit.hartree), + ("energy of lumo", unit.hartree), + ("gap", unit.hartree), + ("electronic spatial extent", unit.angstrom**2), + ("zero point vibrational energy", unit.hartree), + ("internal energy at 0K", unit.hartree), + ("internal energy at 298.15K", unit.hartree), + ("enthalpy at 298.15K", unit.hartree), + ("free energy at 298.15K", unit.hartree), + ("heat capacity at 298.15K", unit.calorie_per_mole / unit.kelvin), + ] + + assert len(labels_prop_units) == len(temp_prop) + + data = {} + for prop, label_prop_unit in zip(temp_prop, labels_prop_units): + label, prop_unit = label_prop_unit + if prop_unit is None: + data[label] = prop + else: + data[label] = self._str_to_float(prop) * prop_unit + return data + + def _parse_xyzfile(self, file_name: str) -> dict: + """ + Parses the file containing information for each molecule. + + Parameters + ---------- + file_name: str, required + Name of the file to parse + + Returns + ------- + dict: + Dict of parsed properties. 
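+
+        Examples
+        --------
+        >>> # illustrative call; dsgdb9nsd_000001.xyz is the methane record
+        >>> record = self._parse_xyzfile("qm9_datafiles/dsgdb9nsd_000001.xyz")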
+ + """ + with open(file_name, "r") as file: + n_atoms = int(file.readline()) + properties_temp = file.readline() + properties = self._parse_properties(properties_temp) + elements = [] + atomic_numbers = [] + geometry = [] + charges = [] + hvf = [] + for i in range(n_atoms): + line = file.readline() + element, x, y, z, q = line.split() + elements.append(element) + atomic_numbers.append(qcel.periodictable.to_atomic_number(element)) + temp = [ + self._str_to_float(x), + self._str_to_float(y), + self._str_to_float(z), + ] + geometry.append(temp) + charges.append(self._str_to_float(q)) + + hvf_temp = file.readline().split() + + smiles = file.readline().split() + InChI = file.readline() + + data = {} + data["name"] = file_name.split("/")[-1].split(".")[0] + data["smiles gdb-17"] = smiles[0] + data["smiles b3lyp"] = smiles[1] + data["inchi"] = InChI.split("\n")[0] + data["geometry"] = np.array(geometry) * unit.angstrom + # data["elements"] = np.array(elements, dtype=str) + data["atomic numbers"] = np.array(atomic_numbers) + data["charges"] = np.array(charges) * unit.elementary_charge + + # remove the tag because it does not provide any useful information + properties.pop("tag") + + # loop over remaining properties and add to the dict + for property, val in properties.items(): + data[property] = val + + for h in hvf_temp: + hvf.append(self._str_to_float(h)) + + data["harmonic vibrational frequencies"] = np.array(hvf) / unit.cm + return data + + def _list_files(self, directory: str, extension: str) -> list: + """ + Returns a list of files in a directory with a given extension. + + Parameters + ---------- + directory: str, required + Directory of interest. + extension: str, required + Only consider files with this given file extension + + Returns + ------- + list + List of files in the given directory with desired extension. + + """ + + logger.debug(f"Gathering xyz data files in {directory}.") + + files = [] + for file in os.listdir(directory): + if file.endswith(extension): + files.append(file) + return files + + def process(self, force_download: bool = False, unit_testing: bool = False) -> None: + """ + Downloads the dataset and extracts relevant information. + + Parameters + ---------- + force_download: bool, optional, default=False + If the raw data_file is present in the local_cache_dir, the local copy will be used. + If True, this will force the software to download the data again, even if present. + unit_testing: bool, optional, default=False + If True, only a subset (first 10 records) of the dataset will be used. + Primarily meant to ensure unit tests can be completed in a reasonable time period. 
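+
+        Notes
+        -----
+        This calls _download() and _extract(), then _parse_xyzfile() for each
+        .xyz file found in local_cache_dir, and finally writes the combined
+        records with dict_to_hdf5().
+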
+ Examples + -------- + > qm9_data = QM9_curation(local_cache_dir='~/datasets/qm9_dataset') + > qm9_data.process() + + """ + name = self.dataset_description["dataset_filename"] + url = self.dataset_description["dataset_download_url"] + + self._download( + url=url, + name=name, + output_path=self.local_cache_dir, + force_download=force_download, + ) + + self._extract( + file_path=f"{self.local_cache_dir}/{name}", + cache_directory=self.local_cache_dir, + ) + files = self._list_files(directory=self.local_cache_dir, extension=".xyz") + + self.data = [] + for i, file in enumerate(tqdm(files, desc="processing", total=len(files))): + data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}") + self.data.append(data_temp) + if unit_testing: + if i > 10: + break + dict_to_hdf5(self.hdf5_file_name, self.data, id_key="name") diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py new file mode 100644 index 00000000..e5d88378 --- /dev/null +++ b/modelforge/curation/utils.py @@ -0,0 +1,62 @@ +import h5py +import pint +from openff.units import unit, Quantity +import numpy as np +from tqdm import tqdm + +# define new context for converting energy to energy/mol + +c = unit.Context("chem") +c.add_transformation( + "[force] * [length]", + "[force] * [length]/[substance]", + lambda unit, x: x * unit.avogadro_constant, +) +c.add_transformation( + "[force] * [length]/[substance]", + "[force] * [length]", + lambda unit, x: x / unit.avogadro_constant, +) +unit.add_context(c) + + +def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: + """ + Writes hdf5 file from dict. + + Parameters + ---------- + file_name: str, required + Name and path of hdf5 file to write. + data: list of dicts, required + List that contains dictionaries of properties for each molecule to write to file. + id_key: str, required + Name of the key in each dict that uniquely describes each molecule. 
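+
+    Notes
+    -----
+    pint.Quantity values are split into a magnitude (stored as the dataset)
+    and a unit string (stored as a "units" attribute on that dataset);
+    plain strings, scalars, and numpy arrays are written as-is.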
+ + Examples + -------- + > dict_to_hdf5(file_name='qm9.hdf5', data=data, id_key='name') + """ + assert file_name.endswith(".hdf5") + + dt = h5py.special_dtype(vlen=str) + + with h5py.File(file_name, "w") as f: + for datapoint in tqdm(data): + record_name = datapoint[id_key] + group = f.create_group(record_name) + for key, val in datapoint.items(): + if isinstance(val, pint.Quantity): + val_m = val.m + val_u = str(val.u) + else: + val_m = val + val_u = None + if isinstance(val_m, str): + group.create_dataset(name=key, data=val_m, dtype=dt) + elif isinstance(val_m, (float, int)): + group.create_dataset(name=key, data=val_m) + elif isinstance(val_m, np.ndarray): + group.create_dataset(name=key, data=val_m, shape=val_m.shape) + if not val_u is None: + group[key].attrs["units"] = val_u diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 40af9452..731bf0b4 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -1,4 +1,5 @@ import os +import shutil from typing import Callable, Dict, List, Optional, Tuple import numpy as np @@ -126,6 +127,7 @@ def _from_hdf5(self) -> None: import h5py import tqdm + import shutil logger.debug("Reading in and processing hdf5 file ...") # initialize dict with empty lists @@ -134,13 +136,15 @@ def _from_hdf5(self) -> None: data[value] = [] logger.debug(f"Processing and extracting data from {self.raw_data_file}") - with gzip.open(self.raw_data_file, "rb") as gz_file, h5py.File( - gz_file, "r" - ) as hf: - logger.debug(f"n_entries: {len(hf.keys())}") - for mol in tqdm.tqdm(list(hf.keys())): - for value in self.properties_of_interest: - data[value].append(hf[mol][value][()]) + with gzip.open(self.raw_data_file, "rb") as gz_file: + with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file: + shutil.copyfileobj(gz_file, out_file) + with h5py.File(self.raw_data_file.replace(".gz", ""), "r") as hf: + logger.debug(f"n_entries: {len(hf.keys())}") + for mol in tqdm.tqdm(list(hf.keys())): + for value in self.properties_of_interest: + data[value].append(hf[mol][value][()]) + self.hdf5data = data def _from_file_cache(self) -> Dict[str, List]: From 6d63ba84b6b41b5f6876d3e726aac8c0958bc0e8 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Sat, 26 Aug 2023 00:44:44 -0700 Subject: [PATCH 02/11] Initial framework for dataset curation and transformation to hdf5 files with units attached. Revised code for opening hdf5 files to be substantially more efficient. --- modelforge/curation/qm9_curation.py | 2 +- modelforge/curation/utils.py | 2 +- modelforge/dataset/dataset.py | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index e2bc158f..7d0882b8 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -188,7 +188,7 @@ def _parse_xyzfile(self, file_name: str) -> dict: file_name: str, required Name of the file to parse - Returns + Return ------- dict: Dict of parsed properties. 
diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py index e5d88378..ab9a6438 100644 --- a/modelforge/curation/utils.py +++ b/modelforge/curation/utils.py @@ -17,7 +17,7 @@ "[force] * [length]", lambda unit, x: x / unit.avogadro_constant, ) -unit.add_context(c) +unit.add_context(c def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py index 731bf0b4..208c6abb 100644 --- a/modelforge/dataset/dataset.py +++ b/modelforge/dataset/dataset.py @@ -136,6 +136,8 @@ def _from_hdf5(self) -> None: data[value] = [] logger.debug(f"Processing and extracting data from {self.raw_data_file}") + # this will create an unzipped file which we can then load in + # this is substantially faster than passing gz_file directly to h5py.File() with gzip.open(self.raw_data_file, "rb") as gz_file: with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file: shutil.copyfileobj(gz_file, out_file) From dbcc1ea40f125668c59fe7e55a3c818234ad777d Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Mon, 28 Aug 2023 10:06:47 -0700 Subject: [PATCH 03/11] File being downloaded from gdrive didn't have .gz extension, causing my logic in the reading to fail (it tried to overwrite the compressed file with uncompressed while streaming, which didn't work well). --- modelforge/curation/qm9_curation.py | 41 +++++++++++++++++++++++------ modelforge/curation/utils.py | 6 +++-- modelforge/dataset/qm9.py | 2 +- modelforge/dataset/utils.py | 2 +- 4 files changed, 39 insertions(+), 12 deletions(-) diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index 7d0882b8..390d7182 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -14,7 +14,17 @@ class QM9_curation: """ - Routines to fetch and process QM9 dataset. + Routines to fetch and process the QM9 dataset. + + The QM9 dataset includes 133,885 organic molecules with up to nine heavy atoms (CONF). + All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. + + Citation: Ramakrishnan, R., Dral, P., Rupp, M. et al. + Quantum chemistry structures and properties of 134 kilo molecules. + Sci Data 1, 140022 (2014). + https://doi.org/10.1038/sdata.2014.22 + + DOI of dataset: 10.6084/m9.figshare.c.978904.v5 Parameters ---------- @@ -29,7 +39,11 @@ def __init__(self, hdf5_file_name: str, local_cache_dir: str = "qm9_datafiles"): self.local_cache_dir = local_cache_dir self.hdf5_file_name = hdf5_file_name - # define key pieces of information related to the dataset + # define key pieces of information related to the dataset in a dict. + # dataset_download_url and dataset_filename are the two + # pieces of information used by the code to fetch the data. + # all other data is metadata that will be used to generate a README to go along with + # the HDF5 dataset. 
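+        # (process() reads "dataset_filename" and "dataset_download_url" from
+        # this dict when fetching the raw archive; see process() below)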
self.dataset_description = { "publication_doi": "10.1038/sdata.2014.22", "collection_doi": "10.6084/m9.figshare.c.978904.v5", @@ -148,7 +162,8 @@ def _parse_properties(self, line: str) -> dict: temp_prop = line.split() # list of properties and their units - labels_prop_units = [ + # in the order they appear in the line + labels_and_units = [ ("tag", None), ("idx", None), ("rotational constant A", unit.gigahertz), @@ -168,11 +183,11 @@ def _parse_properties(self, line: str) -> dict: ("heat capacity at 298.15K", unit.calorie_per_mole / unit.kelvin), ] - assert len(labels_prop_units) == len(temp_prop) + assert len(labels_and_units) == len(temp_prop) data = {} - for prop, label_prop_unit in zip(temp_prop, labels_prop_units): - label, prop_unit = label_prop_unit + for prop, label_and_unit in zip(temp_prop, labels_and_units): + label, prop_unit = label_and_unit if prop_unit is None: data[label] = prop else: @@ -188,7 +203,7 @@ def _parse_xyzfile(self, file_name: str) -> dict: file_name: str, required Name of the file to parse - Return + Returns ------- dict: Dict of parsed properties. @@ -227,6 +242,9 @@ def _parse_xyzfile(self, file_name: str) -> dict: data["smiles b3lyp"] = smiles[1] data["inchi"] = InChI.split("\n")[0] data["geometry"] = np.array(geometry) * unit.angstrom + # Element symbols are converted to atomic numbers + # including an array of strings causes complications + # when writing the hdf5 file. # data["elements"] = np.array(elements, dtype=str) data["atomic numbers"] = np.array(atomic_numbers) data["charges"] = np.array(charges) * unit.elementary_charge @@ -272,7 +290,7 @@ def _list_files(self, directory: str, extension: str) -> list: def process(self, force_download: bool = False, unit_testing: bool = False) -> None: """ - Downloads the dataset and extracts relevant information. + Downloads the dataset, extracts relevant information, and writes an hdf5 file. Parameters ---------- @@ -291,6 +309,7 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N name = self.dataset_description["dataset_filename"] url = self.dataset_description["dataset_download_url"] + # download the dataset self._download( url=url, name=name, @@ -298,12 +317,16 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N force_download=force_download, ) + # untar the dataset self._extract( file_path=f"{self.local_cache_dir}/{name}", cache_directory=self.local_cache_dir, ) + + # list the files in the directory to examine files = self._list_files(directory=self.local_cache_dir, extension=".xyz") + # parse the information in each datat file, saving to a list of dicts, data self.data = [] for i, file in enumerate(tqdm(files, desc="processing", total=len(files))): data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}") @@ -311,4 +334,6 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N if unit_testing: if i > 10: break + + # generate the hdf5 file from the list of dicts dict_to_hdf5(self.hdf5_file_name, self.data, id_key="name") diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py index ab9a6438..6e6297d9 100644 --- a/modelforge/curation/utils.py +++ b/modelforge/curation/utils.py @@ -17,12 +17,14 @@ "[force] * [length]", lambda unit, x: x / unit.avogadro_constant, ) -unit.add_context(c +unit.add_context(c) def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: """ - Writes hdf5 file from dict. + Writes hdf5 file from a list of dicts. 
+ + This will write units, if provided as attributes. Parameters ---------- diff --git a/modelforge/dataset/qm9.py b/modelforge/dataset/qm9.py index a935abd2..b6fa872b 100644 --- a/modelforge/dataset/qm9.py +++ b/modelforge/dataset/qm9.py @@ -67,7 +67,7 @@ def __init__( dataset_name = f"{dataset_name}_subset" super().__init__( - f"{local_cache_dir}/{dataset_name}_cache.hdf5", + f"{local_cache_dir}/{dataset_name}_cache.hdf5.gz", f"{local_cache_dir}/{dataset_name}_processed.npz", ) self.dataset_name = dataset_name diff --git a/modelforge/dataset/utils.py b/modelforge/dataset/utils.py index c9ea2652..71b49158 100644 --- a/modelforge/dataset/utils.py +++ b/modelforge/dataset/utils.py @@ -139,7 +139,7 @@ def _download_from_gdrive(id: str, raw_dataset_file: str): Examples -------- - >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5") + >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5.gz") """ import gdown From 50e8d74cea70e5b00d6a4b6a0e5b55017455515b Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Mon, 28 Aug 2023 20:18:56 -0700 Subject: [PATCH 04/11] Fixing parsing issue of InChI strings. --- modelforge/curation/qm9_curation.py | 147 +++++++++++++++++++++------- 1 file changed, 110 insertions(+), 37 deletions(-) diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index 390d7182..f9398f7c 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -14,8 +14,9 @@ class QM9_curation: """ - Routines to fetch and process the QM9 dataset. + Routines to fetch and process the QM9 dataset into a curated hdf5 file. + Dataset description: The QM9 dataset includes 133,885 organic molecules with up to nine heavy atoms (CONF). All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. @@ -24,48 +25,73 @@ class QM9_curation: Sci Data 1, 140022 (2014). https://doi.org/10.1038/sdata.2014.22 - DOI of dataset: 10.6084/m9.figshare.c.978904.v5 + DOI for dataset: 10.6084/m9.figshare.c.978904.v5 Parameters ---------- hdf5_file_name: str, required - Name of the hdf5 file that will be generated - local_cache_dir: str, required, default=qm9_datafiles + Name of the hdf5 file that will be generated. + output_file_path: str, optional, default='./' + Path to write the output hdf5 file. + local_cache_dir: str, optional, default='./qm9_datafiles' Location to save downloaded dataset. + Examples + -------- + >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset') + >>> qm9_data.process() + """ - def __init__(self, hdf5_file_name: str, local_cache_dir: str = "qm9_datafiles"): + def __init__( + self, + hdf5_file_name: str, + output_file_path: str = "./", + local_cache_dir: str = "./qm9_datafiles", + ): self.local_cache_dir = local_cache_dir + self.output_file_path = output_file_path self.hdf5_file_name = hdf5_file_name - # define key pieces of information related to the dataset in a dict. - # dataset_download_url and dataset_filename are the two - # pieces of information used by the code to fetch the data. - # all other data is metadata that will be used to generate a README to go along with + # Below, we define key pieces of information related to the dataset in the form of a dict. + # `dataset_download_url` and `dataset_filename` are used by the code to fetch the data. + # All other data is metadata that will be used to generate a README to go along with # the HDF5 dataset. 
         self.dataset_description = {
             "publication_doi": "10.1038/sdata.2014.22",
-            "collection_doi": "10.6084/m9.figshare.c.978904.v5",
-            "dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2",
+            "figshare_dataset_doi": "10.6084/m9.figshare.c.978904.v5",
+            "figshare_dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2",
             "dataset_download_url": "https://ndownloader.figshare.com/files/3195389",
             "dataset_filename": "dsgdb9nsd.xyz.tar.bz2",
-            "publication_citation": """Ramakrishnan, R., Dral, P., Rupp, M. et al.
-            Quantum chemistry structures and properties of 134 kilo molecules.
-            Sci Data 1, 140022 (2014).
-            https://doi.org/10.1038/sdata.2014.22""",
-            "dataset_citation": """Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014).
-            Quantum chemistry structures and properties of 134 kilo molecules.
-            figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5""",
-            "description": """QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF).
-            All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.""",
+            "publication_citation": """
+            Ramakrishnan, R., Dral, P., Rupp, M. et al.
+            Quantum chemistry structures and properties of 134 kilo molecules.
+            Sci Data 1, 140022 (2014).
+            https://doi.org/10.1038/sdata.2014.22
+            """,
+            "dataset_citation": """
+            Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014).
+            Quantum chemistry structures and properties of 134 kilo molecules.
+            figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5
+            """,
+            "description": """
+            QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF).
+            All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.
+            """,
         }
 
+    def _mkdir(self, path: str) -> None:
+        if not os.path.exists(path):
+            try:
+                os.makedirs(path)
+            except Exception:
+                print(f"Could not create directory {path}.")
+
     def _download(
         self, url: str, name: str, output_path: str, force_download=False
     ) -> None:
         """
-        Downloads the dataset tar file from figshare.
+        Downloads the dataset tar.bz2 file from figshare.
Parameters ---------- @@ -84,19 +110,21 @@ def _download( if not os.path.isfile(f"{output_path}/{name}") or force_download: logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.") chunk_size = 512 + # get the head of the request head = requests.head(url) - # because the url is calling a downloader, instead of the direct file - # we can extract the file location and then fetch the length from this head - # this is only useful for the download bar status + # because the url on figshare calls downloader, instead of the direct file, + # we need to figure out what the original file is to know how big it is + # here we will parse the header info to get the file the downloader links to + # and then get the head info from this link to fetch the length + # this is not actually necessary, but useful for updating download status bar temp_url = head.headers["location"].split("?")[0] length = int(requests.head(temp_url).headers["Content-Length"]) r = requests.get(url, stream=True) - if not os.path.exists(output_path): - os.makedirs(output_path) + self._mkdir(output_path) with open(f"{output_path}/{name}", "wb") as fd: for chunk in tqdm( @@ -106,7 +134,7 @@ def _download( total=(int(length / chunk_size) + 1), ): fd.write(chunk) - else: + else: # if the file exists and we don't set force_download to True, just use the cached version logger.debug("Datafile exists, using cached file.") def _extract(self, file_path: str, cache_directory: str) -> None: @@ -128,9 +156,9 @@ def _extract(self, file_path: str, cache_directory: str) -> None: def _str_to_float(self, x: str) -> float: """ - Converts a string to float, fixing Mathematica style scientific notion. + Converts a string to float, changing Mathematica style scientific notion to python style. - For example converts str(1*^-6) to float(1e-6). + For example, this will convert str(1*^-6) to float(1e-6). Parameters ---------- @@ -147,12 +175,13 @@ def _str_to_float(self, x: str) -> float: def _parse_properties(self, line: str) -> dict: """ - Parses the property line in the xyz file. + Parses the line in the xyz file that contains property information. Properties ---------- line: str, required - String to parse following the description in the original manuscript (See tables 2 and 3) + String to parse that contains property information. The structure of this line + following the description in the original manuscript (See tables 2 and 3). Returns ------- @@ -161,8 +190,8 @@ def _parse_properties(self, line: str) -> dict: """ temp_prop = line.split() - # list of properties and their units - # in the order they appear in the line + # list of properties and their units in the order they appear in the file. + labels_and_units = [ ("tag", None), ("idx", None), @@ -198,6 +227,9 @@ def _parse_xyzfile(self, file_name: str) -> dict: """ Parses the file containing information for each molecule. + Structure of the file (based on tables 2 and 3 of the original manuscript): + + Parameters ---------- file_name: str, required @@ -208,7 +240,40 @@ def _parse_xyzfile(self, file_name: str) -> dict: dict: Dict of parsed properties. 
+ File format info + ---------------- + + Line Content + 1 Number of atoms, n_a + 2 Scalar properties (see below) + 3,...mn_a+2 Element type, coords (x,y,z \AA) Mulliken partial charges (in e) + n_a+3 Harmonic vibrational frequencies (3n_a-5 or 3n_a-6 in cm^-1) + n_a+4 SMILES strings from GDB-17 and B3LYP relaxation + n_a+5 InChI strings for Corina and B3LYP geometries + + Scalar properties: + + # Unit Description + 1 N/A gdb9 string to facilitate extraction + 2 N/A Consecutive, 1-based integer identifier + 3 GHz Rotational constant A + 4 GHz Rotational constant B + 5 GHz Rotational constant C + 6 D Dipole moment + 7 \AA^3 Isotropic polarizability + 8 Ha Energy of HOMO + 9 Ha Energy of LUMO + 10 Ha LUMO-HOMO gap + 11 \AA^2 Electronic spatial extent + 12 Ha Zero point vibrational energy + 13 Ha Internal energy at 0K + 14 Ha Internal energy at 298.15K + 15 Ha Enthalpy at 298.15K + 16 Ha Free energy at 298.15K + 17 cal/mol/K Heat capacity at 298.15K + """ + with open(file_name, "r") as file: n_atoms = int(file.readline()) properties_temp = file.readline() @@ -240,7 +305,8 @@ def _parse_xyzfile(self, file_name: str) -> dict: data["name"] = file_name.split("/")[-1].split(".")[0] data["smiles gdb-17"] = smiles[0] data["smiles b3lyp"] = smiles[1] - data["inchi"] = InChI.split("\n")[0] + data["inchi Corina"] = InChI.split("\n")[0].split()[0].replace("InChI=", "") + data["inchi B3LYP"] = InChI.split("\n")[0].split()[1].replace("InChI=", "") data["geometry"] = np.array(geometry) * unit.angstrom # Element symbols are converted to atomic numbers # including an array of strings causes complications @@ -286,6 +352,7 @@ def _list_files(self, directory: str, extension: str) -> list: for file in os.listdir(directory): if file.endswith(extension): files.append(file) + files.sort() return files def process(self, force_download: bool = False, unit_testing: bool = False) -> None: @@ -300,10 +367,11 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N unit_testing: bool, optional, default=False If True, only a subset (first 10 records) of the dataset will be used. Primarily meant to ensure unit tests can be completed in a reasonable time period. + Examples -------- - > qm9_data = QM9_curation(local_cache_dir='~/datasets/qm9_dataset') - > qm9_data.process() + >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset') + >>> qm9_data.process() """ name = self.dataset_description["dataset_filename"] @@ -335,5 +403,10 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N if i > 10: break + self._mkdir(self.output_file_path) + + full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}" # generate the hdf5 file from the list of dicts - dict_to_hdf5(self.hdf5_file_name, self.data, id_key="name") + logger.debug("Writing HDF5 file.") + + dict_to_hdf5(full_output_path, self.data, id_key="name") From a1f04adfb242397e4a9046d9a2d5789a8eb5f23c Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Tue, 29 Aug 2023 00:07:38 -0700 Subject: [PATCH 05/11] Added unit conversion. By default, units will be converted to nm and kJ/mol (can be toggled on or off). output units are hardcoded as a dict in the constructor. The goal is to capture provenance, not make these necessarily very mutable, so hard-coding should be fine. 
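
For reference, conversion goes through the custom pint context ("chem") that
modelforge/curation/utils.py registers at import time; the context supplies the
factor of Avogadro's constant needed to move between energy and energy-per-mole.
A minimal sketch of the idea (illustrative only, assuming openff-units and
modelforge are importable):

    from openff.units import unit

    import modelforge.curation.utils  # registers the "chem" context on import

    energy = -40.47 * unit.hartree
    # "chem" provides the [energy] <-> [energy]/[substance] transformation
    energy_per_mol = energy.to(unit.kilojoule_per_mole, "chem")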
---
 modelforge/curation/qm9_curation.py | 38 ++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py
index f9398f7c..ee3e876b 100644
--- a/modelforge/curation/qm9_curation.py
+++ b/modelforge/curation/qm9_curation.py
@@ -3,12 +3,13 @@
 import os
 from tqdm import tqdm
 
+from typing import Optional
 from openff.units import unit, Quantity
 import pint
 import qcelemental as qcel
 
 import tarfile
-from modelforge.curation.utils import dict_to_hdf5
+from modelforge.curation.utils import *
 import numpy as np
 
 
@@ -35,6 +36,9 @@ class QM9_curation:
         Path to write the output hdf5 file.
     local_cache_dir: str, optional, default='./qm9_datafiles'
         Location to save downloaded dataset.
+    convert_units: bool, optional, default=True
+        Convert from [angstrom, hartree] (i.e., source units)
+        to [nanometer, kJ/mol]
 
     Examples
     --------
@@ -46,12 +50,14 @@ class QM9_curation:
     def __init__(
         self,
         hdf5_file_name: str,
-        output_file_path: str = "./",
-        local_cache_dir: str = "./qm9_datafiles",
+        output_file_path: Optional[str] = "./",
+        local_cache_dir: Optional[str] = "./qm9_datafiles",
+        convert_units: Optional[bool] = True,
     ):
         self.local_cache_dir = local_cache_dir
         self.output_file_path = output_file_path
         self.hdf5_file_name = hdf5_file_name
+        self.convert_units = convert_units
 
         # Below, we define key pieces of information related to the dataset in the form of a dict.
         # `dataset_download_url` and `dataset_filename` are used by the code to fetch the data.
@@ -79,6 +85,20 @@ def __init__(
             All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.
             """,
         }
+        # if convert_units is True we will
+        # convert the following units
+        self.unit_output_dict = {
+            "geometry": unit.nanometer,
+            "energy of homo": unit.kilojoule_per_mole,
+            "energy of lumo": unit.kilojoule_per_mole,
+            "gap": unit.kilojoule_per_mole,
+            "zero point vibrational energy": unit.kilojoule_per_mole,
+            "internal energy at 0K": unit.kilojoule_per_mole,
+            "internal energy at 298.15K": unit.kilojoule_per_mole,
+            "enthalpy at 298.15K": unit.kilojoule_per_mole,
+            "free energy at 298.15K": unit.kilojoule_per_mole,
+            "heat capacity at 298.15K": unit.kilojoule_per_mole / unit.kelvin,
+        }
 
     def _mkdir(self, path: str) -> None:
         if not os.path.exists(path):
@@ -326,6 +346,18 @@ def _parse_xyzfile(self, file_name: str) -> dict:
             hvf.append(self._str_to_float(h))
 
         data["harmonic vibrational frequencies"] = np.array(hvf) / unit.cm
+
+        # if unit outputs were defined perform conversion
+        if self.convert_units:
+            for key in data.keys():
+                if key in self.unit_output_dict.keys():
+                    try:
+                        data[key] = data[key].to(self.unit_output_dict[key], "chem")
+                    except Exception:
+                        print(
+                            f"could not convert {key} with units {data[key].u} to {self.unit_output_dict[key]}"
+                        )
+
         return data
 
     def _list_files(self, directory: str, extension: str) -> list:

From f4f475f82f829e564a7181aff7c2bb23454d44fb Mon Sep 17 00:00:00 2001
From: chrisiacovella
Date: Tue, 29 Aug 2023 15:21:50 -0700
Subject: [PATCH 06/11] Added unit tests for hdf5 writer and qm9_curation.
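
The hdf5 writer tests use a write/read round trip: build a small list of
dicts, write it with dict_to_hdf5, then read the file back with h5py and
reattach units from the "units" attribute. A rough sketch of that pattern
(illustrative; the file and record names here are hypothetical):

    import h5py
    import numpy as np
    from openff.units import unit
    from modelforge.curation.utils import dict_to_hdf5

    data = [{"name": "mol0", "energy": -1.0 * unit.hartree}]
    dict_to_hdf5(file_name="roundtrip.hdf5", data=data, id_key="name")
    with h5py.File("roundtrip.hdf5", "r") as hf:
        u = hf["mol0"]["energy"].attrs["units"]
        energy = hf["mol0"]["energy"][()] * unit.parse_expression(u)
    assert np.isclose(energy.m, -1.0)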
---
 modelforge/curation/qm9_curation.py |  66 +++---
 modelforge/curation/utils.py        |  35 +--
 modelforge/tests/test_curation.py   | 327 ++++++++++++++++++++++++++++
 3 files changed, 381 insertions(+), 47 deletions(-)
 create mode 100644 modelforge/tests/test_curation.py

diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py
index ee3e876b..7503ef81 100644
--- a/modelforge/curation/qm9_curation.py
+++ b/modelforge/curation/qm9_curation.py
@@ -17,7 +17,6 @@ class QM9_curation:
     """
     Routines to fetch and process the QM9 dataset into a curated hdf5 file.
 
-    Dataset description:
     The QM9 dataset includes 133,885 organic molecules with up to nine heavy atoms (CONF).
     All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.
 
@@ -266,7 +265,7 @@ def _parse_xyzfile(self, file_name: str) -> dict:
         Line        Content
         1           Number of atoms, n_a
         2           Scalar properties (see below)
-        3,...mn_a+2 Element type, coords (x,y,z \AA) Mulliken partial charges (in e)
+        3,...,n_a+2 Element type, coords (x,y,z Ang) Mulliken partial charges (in e)
         n_a+3       Harmonic vibrational frequencies (3n_a-5 or 3n_a-6 in cm^-1)
         n_a+4       SMILES strings from GDB-17 and B3LYP relaxation
         n_a+5       InChI strings for Corina and B3LYP geometries
@@ -280,11 +279,11 @@ def _parse_xyzfile(self, file_name: str) -> dict:
         4      GHz          Rotational constant B
         5      GHz          Rotational constant C
         6      D            Dipole moment
-        7      \AA^3        Isotropic polarizability
+        7      Ang^3        Isotropic polarizability
         8      Ha           Energy of HOMO
         9      Ha           Energy of LUMO
         10     Ha           LUMO-HOMO gap
-        11     \AA^2        Electronic spatial extent
+        11     Ang^2        Electronic spatial extent
         12     Ha           Zero point vibrational energy
         13     Ha           Internal energy at 0K
         14     Ha           Internal energy at 298.15K
         15     Ha           Enthalpy at 298.15K
         16     Ha           Free energy at 298.15K
         17     cal/mol/K    Heat capacity at 298.15K
@@ -387,6 +386,37 @@ def _list_files(self, directory: str, extension: str) -> list:
         files.sort()
         return files
 
+    def _process_downloaded(
+        self, local_path_to_tar: str, name: str, unit_testing: bool
+    ):
+        # untar the dataset
+        self._extract(
+            file_path=f"{local_path_to_tar}/{name}",
+            cache_directory=self.local_cache_dir,
+        )
+
+        # list the files in the directory to examine
+        files = self._list_files(directory=self.local_cache_dir, extension=".xyz")
+
+        # parse the information in each data file, saving to a list of dicts, data
+        self.data = []
+        for i, file in enumerate(tqdm(files, desc="processing", total=len(files))):
+            # when unit testing, only process the first 10 records
+            if unit_testing:
+                if i > 9:
+                    break
+
+            data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}")
+            self.data.append(data_temp)
+
+        self._mkdir(self.output_file_path)
+
+        full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}"
+
+        # generate the hdf5 file from the list of dicts
+        logger.debug("Writing HDF5 file.")
+        dict_to_hdf5(full_output_path, self.data, id_key="name")
+
     def process(self, force_download: bool = False, unit_testing: bool = False) -> None:
         """
         Downloads the dataset, extracts relevant information, and writes an hdf5 file.
@@ -416,29 +446,5 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N output_path=self.local_cache_dir, force_download=force_download, ) - - # untar the dataset - self._extract( - file_path=f"{self.local_cache_dir}/{name}", - cache_directory=self.local_cache_dir, - ) - - # list the files in the directory to examine - files = self._list_files(directory=self.local_cache_dir, extension=".xyz") - - # parse the information in each datat file, saving to a list of dicts, data - self.data = [] - for i, file in enumerate(tqdm(files, desc="processing", total=len(files))): - data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}") - self.data.append(data_temp) - if unit_testing: - if i > 10: - break - - self._mkdir(self.output_file_path) - - full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}" - # generate the hdf5 file from the list of dicts - logger.debug("Writing HDF5 file.") - - dict_to_hdf5(full_output_path, self.data, id_key="name") + # process the rest of the dataset + self._process_downloaded(self.local_cache_dir, name, unit_testing) diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py index 6e6297d9..4c4d216e 100644 --- a/modelforge/curation/utils.py +++ b/modelforge/curation/utils.py @@ -22,9 +22,9 @@ def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: """ - Writes hdf5 file from a list of dicts. + Writes an hdf5 file from a list of dicts. - This will write units, if provided as attributes. + This will include units, if provided as attributes. Parameters ---------- @@ -33,7 +33,7 @@ def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: data: list of dicts, required List that contains dictionaries of properties for each molecule to write to file. id_key: str, required - Name of the key in each dict that uniquely describes each molecule. + Name of the key in the dicts that uniquely describes each record. 
Examples -------- @@ -48,17 +48,18 @@ def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: record_name = datapoint[id_key] group = f.create_group(record_name) for key, val in datapoint.items(): - if isinstance(val, pint.Quantity): - val_m = val.m - val_u = str(val.u) - else: - val_m = val - val_u = None - if isinstance(val_m, str): - group.create_dataset(name=key, data=val_m, dtype=dt) - elif isinstance(val_m, (float, int)): - group.create_dataset(name=key, data=val_m) - elif isinstance(val_m, np.ndarray): - group.create_dataset(name=key, data=val_m, shape=val_m.shape) - if not val_u is None: - group[key].attrs["units"] = val_u + if key != id_key: + if isinstance(val, pint.Quantity): + val_m = val.m + val_u = str(val.u) + else: + val_m = val + val_u = None + if isinstance(val_m, str): + group.create_dataset(name=key, data=val_m, dtype=dt) + elif isinstance(val_m, (float, int)): + group.create_dataset(name=key, data=val_m) + elif isinstance(val_m, np.ndarray): + group.create_dataset(name=key, data=val_m, shape=val_m.shape) + if not val_u is None: + group[key].attrs["units"] = val_u diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py new file mode 100644 index 00000000..ce4bc20b --- /dev/null +++ b/modelforge/tests/test_curation.py @@ -0,0 +1,327 @@ +import os +import sys +from pathlib import Path + +import numpy as np +import pytest +from openff.units import unit, Quantity +import pint +import importlib_resources as resources + +from modelforge.curation.utils import * + +from modelforge.curation.qm9_curation import * + + +@pytest.fixture(scope="session") +def prep_temp_dir(tmp_path_factory): + fn = tmp_path_factory.mktemp("hdf5_data") + return fn + + # generate test data into a temporary path + + +def test_dict_to_hdf5(prep_temp_dir): + # generate an hdf5 file from simple test data + # then read it in and see that we can reconstruct the same data + file_path = str(prep_temp_dir) + test_data = [ + { + "name": "test1", + "energy": 123 * unit.hartree, + "geometry": np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) * unit.angstrom, + }, + { + "name": "test2", + "energy": 456 * unit.hartree, + "geometry": np.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]]) * unit.angstrom, + }, + ] + file_name_path = file_path + "/test.hdf5" + dict_to_hdf5(file_name=file_name_path, data=test_data, id_key="name") + + # check we wrote the file + assert os.path.isfile(file_name_path) + + # read in the hdf5 file + records = [] + with h5py.File(file_name_path, "r") as hf: + test_names = list(hf.keys()) + + # validate names + assert test_names == ["test1", "test2"] + + for name in test_names: + temp_record = {} + temp_record["name"] = name + properties = list(hf[name].keys()) + + for property in properties: + # validate properties name + assert property in ["energy", "geometry"] + if "units" in hf[name][property].attrs: + u = hf[name][property].attrs["units"] + temp_record[property] = hf[name][property][ + () + ] * unit.parse_expression(u) + + else: + temp_record[property] = hf[name][property][()] + + records.append(temp_record) + + # loop over reconstructed list of dictionaries and compare contents + for i in range(len(records)): + for key in test_data[i].keys(): + if isinstance(records[i][key], pint.Quantity): + record_m = records[i][key].m + test_data_m = test_data[i][key].m + + if isinstance(record_m, np.ndarray): + print(record_m, test_data_m) + assert np.all(record_m == test_data_m) + else: + assert record_m == test_data_m + else: + assert records[i][key] == test_data[i][key] + + 
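+# The remaining tests exercise the QM9_curation helper routines; they reuse
+# the session-scoped temporary directory provided by the prep_temp_dir fixture.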
+def test_qm9_curation_helper_functions(prep_temp_dir): + qm9_data = QM9_curation( + hdf5_file_name="qm9_dataset.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + ) + + val = qm9_data._str_to_float("1*^6") + assert val == 1e6 + + # check the function to list directory contents + files = qm9_data._list_files(str(prep_temp_dir), ".hdf5") + + # check to see if test.hdf5 is in the files + assert "test.hdf5" in files + + +def test_qm9_curation_parse_xyz(prep_temp_dir): + qm9_data = QM9_curation( + hdf5_file_name="qm9_dataset.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + ) + + # check to ensure we can parse the properties line correctly + # This is data is modified from dsgdb9nsd_000001.xyz, with floats truncated to one decimal place + temp_line = "gdb 1 157.7 157.7 157.7 0. 13.2 -0.3 0.1 0.5 35.3 0.0 -40.4 -40.4 -40.4 -40.4 6.4" + temp_dict = qm9_data._parse_properties(temp_line) + + assert len(temp_dict) == 17 + assert temp_dict["tag"] == "gdb" + assert temp_dict["idx"] == "1" + assert temp_dict["rotational constant A"] == 157.7 * unit.gigahertz + assert temp_dict["rotational constant B"] == 157.7 * unit.gigahertz + assert temp_dict["rotational constant C"] == 157.7 * unit.gigahertz + assert temp_dict["dipole moment"] == 0 * unit.debye + assert temp_dict["isotropic polarizability"] == 13.2 * unit.angstrom**3 + assert temp_dict["energy of homo"] == -0.3 * unit.hartree + assert temp_dict["energy of lumo"] == 0.1 * unit.hartree + assert temp_dict["gap"] == 0.5 * unit.hartree + assert temp_dict["electronic spatial extent"] == 35.3 * unit.angstrom**2 + assert temp_dict["zero point vibrational energy"] == 0.0 * unit.hartree + assert temp_dict["internal energy at 0K"] == -40.4 * unit.hartree + assert temp_dict["internal energy at 298.15K"] == -40.4 * unit.hartree + assert temp_dict["enthalpy at 298.15K"] == -40.4 * unit.hartree + assert temp_dict["free energy at 298.15K"] == -40.4 * unit.hartree + assert ( + temp_dict["heat capacity at 298.15K"] + == 6.4 * unit.calorie_per_mole / unit.kelvin + ) + + fn = resources.files("modelforge").joinpath("tests", "data", "dsgdb9nsd_000001.xyz") + data_dict_temp = qm9_data._parse_xyzfile(str(fn)) + + # spot check values + assert np.all( + np.isclose( + data_dict_temp["geometry"], + np.array( + [ + [-1.26981359e-03, 1.08580416e-01, 8.00099580e-04], + [2.15041600e-04, -6.03131760e-04, 1.97612040e-04], + [1.01173084e-01, 1.46375116e-01, 2.76574800e-05], + [-5.40815069e-02, 1.44752661e-01, -8.76643715e-02], + [-5.23813634e-02, 1.43793264e-01, 9.06397294e-02], + ] + ) + * unit.nanometer, + ) + ) + + assert np.all( + data_dict_temp["charges"] + == np.array([-0.535689, 0.133921, 0.133922, 0.133923, 0.133923]) + * unit.elementary_charge + ) + assert data_dict_temp["isotropic polarizability"] == 13.21 * unit.angstroms**3 + assert ( + data_dict_temp["energy of homo"] + == -1017.9062102263447 * unit.kilojoule_per_mole + ) + assert ( + data_dict_temp["energy of lumo"] == 307.4460077830925 * unit.kilojoule_per_mole + ) + assert data_dict_temp["gap"] == 1325.3522180094374 * unit.kilojoule_per_mole + assert data_dict_temp["electronic spatial extent"] == 35.3641 * unit.angstrom**2 + assert ( + data_dict_temp["zero point vibrational energy"] + == 117.4884833670846 * unit.kilojoule_per_mole + ) + assert ( + data_dict_temp["internal energy at 0K"] + == -106277.4161215308 * unit.kilojoule_per_mole + ) + assert ( + data_dict_temp["internal energy at 298.15K"] + == -106269.88618856476 * unit.kilojoule_per_mole 
+ ) + assert ( + data_dict_temp["enthalpy at 298.15K"] + == -106267.40509140545 * unit.kilojoule_per_mole + ) + assert ( + data_dict_temp["free energy at 298.15K"] + == -106329.05182294044 * unit.kilojoule_per_mole + ) + assert ( + data_dict_temp["heat capacity at 298.15K"] + == 0.027066296000000004 * unit.kilojoule_per_mole / unit.kelvin + ) + assert np.all(data_dict_temp["atomic numbers"] == np.array([6, 1, 1, 1, 1])) + assert data_dict_temp["smiles gdb-17"] == "C" + assert data_dict_temp["smiles b3lyp"] == "C" + assert data_dict_temp["inchi Corina"] == "1S/CH4/h1H4" + assert data_dict_temp["inchi B3LYP"] == "1S/CH4/h1H4" + assert data_dict_temp["rotational constant A"] == 157.7118 * unit.gigahertz + assert data_dict_temp["rotational constant B"] == 157.70997 * unit.gigahertz + assert data_dict_temp["rotational constant C"] == 157.70699 * unit.gigahertz + assert data_dict_temp["dipole moment"] == 0.0 * unit.debye + assert np.all( + data_dict_temp["harmonic vibrational frequencies"] + == np.array( + [ + 1341.307, + 1341.3284, + 1341.365, + 1562.6731, + 1562.7453, + 3038.3205, + 3151.6034, + 3151.6788, + 3151.7078, + ] + ) + / unit.centimeter + ) + + +def test_qm9_local_archive(prep_temp_dir): + # test file extraction, parsing, and generation of hdf5 file + # from a local archive. + qm9_data = QM9_curation( + hdf5_file_name="qm9_test10.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + ) + + fn = resources.files("modelforge").joinpath("tests", "data") + + qm9_data._process_downloaded(str(fn), "first10.tar.bz2", unit_testing=True) + + assert len(qm9_data.data) == 10 + + file_name_path = str(fn) + "/first10.tar.bz2" + assert os.path.isfile(file_name_path) + + names = { + "dsgdb9nsd_000001": -106277.4161215308, + "dsgdb9nsd_000002": -148408.69593977975, + "dsgdb9nsd_000003": -200600.51755556674, + "dsgdb9nsd_000004": -202973.24721725564, + "dsgdb9nsd_000005": -245252.87826713378, + "dsgdb9nsd_000006": -300576.6846578527, + "dsgdb9nsd_000007": -209420.75231941737, + "dsgdb9nsd_000008": -303715.5298633426, + "dsgdb9nsd_000009": -306158.32885940996, + "dsgdb9nsd_000010": -348451.454977435, + } + file_name_path = str(prep_temp_dir) + "/qm9_test10.hdf5" + + with h5py.File(file_name_path, "r") as hf: + for key in hf.keys(): + # check record names + assert key in list(names.keys()) + assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) + + +def test_qm9_download(prep_temp_dir): + qm9_data = QM9_curation( + hdf5_file_name="qm9_test10.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + ) + name = qm9_data.dataset_description["dataset_filename"] + url = qm9_data.dataset_description["dataset_download_url"] + + qm9_data._download( + url=url, + name=name, + output_path=str(prep_temp_dir), + force_download=False, + ) + + file_name_path = str(prep_temp_dir) + f"/{name}" + assert os.path.isfile(file_name_path) + + +def test_qm9_curation(prep_temp_dir): + # test file download and extraction + # this downloads the entire archive and extracts it + # but only processes the first 10 records + qm9_data = QM9_curation( + hdf5_file_name="qm9_dataset.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + ) + + # test all the functions will run + qm9_data.process(unit_testing=True) + + name = qm9_data.dataset_description["dataset_filename"] + + file_name_path = str(prep_temp_dir) + f"/{name}" + assert os.path.isfile(file_name_path) + + # ensure we processed 10 records + assert len(qm9_data.data) == 10 + + 
file_name_path = str(prep_temp_dir) + "/qm9_dataset.hdf5" + assert os.path.isfile(file_name_path) + + names = { + "dsgdb9nsd_000001": -106277.4161215308, + "dsgdb9nsd_000002": -148408.69593977975, + "dsgdb9nsd_000003": -200600.51755556674, + "dsgdb9nsd_000004": -202973.24721725564, + "dsgdb9nsd_000005": -245252.87826713378, + "dsgdb9nsd_000006": -300576.6846578527, + "dsgdb9nsd_000007": -209420.75231941737, + "dsgdb9nsd_000008": -303715.5298633426, + "dsgdb9nsd_000009": -306158.32885940996, + "dsgdb9nsd_000010": -348451.454977435, + } + + with h5py.File(file_name_path, "r") as hf: + for key in hf.keys(): + # check record names + assert key in list(names.keys()) + assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) From ec8d24d4c81d455f16e71ca0548024d4475dc368 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Tue, 29 Aug 2023 15:30:05 -0700 Subject: [PATCH 07/11] Added datafiles for testing. --- modelforge/tests/data/dsgdb9nsd_000001.xyz | 10 ++++++++++ modelforge/tests/data/first10.tar.bz2 | Bin 0 -> 2616 bytes 2 files changed, 10 insertions(+) create mode 100644 modelforge/tests/data/dsgdb9nsd_000001.xyz create mode 100644 modelforge/tests/data/first10.tar.bz2 diff --git a/modelforge/tests/data/dsgdb9nsd_000001.xyz b/modelforge/tests/data/dsgdb9nsd_000001.xyz new file mode 100644 index 00000000..bd998ede --- /dev/null +++ b/modelforge/tests/data/dsgdb9nsd_000001.xyz @@ -0,0 +1,10 @@ +5 +gdb 1 157.7118 157.70997 157.70699 0. 13.21 -0.3877 0.1171 0.5048 35.3641 0.044749 -40.47893 -40.476062 -40.475117 -40.498597 6.469 +C -0.0126981359 1.0858041578 0.0080009958 -0.535689 +H 0.002150416 -0.0060313176 0.0019761204 0.133921 +H 1.0117308433 1.4637511618 0.0002765748 0.133922 +H -0.540815069 1.4475266138 -0.8766437152 0.133923 +H -0.5238136345 1.4379326443 0.9063972942 0.133923 +1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078 +C C +InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4 diff --git a/modelforge/tests/data/first10.tar.bz2 b/modelforge/tests/data/first10.tar.bz2 new file mode 100644 index 0000000000000000000000000000000000000000..8426d518ca6d94dcc5cd8f22a5c88cfb1f3f1c4d GIT binary patch literal 2616 zcmV-83di+AT4*^jL0KkKS*H`@k^l_H|I_?10H8>R|KbQ`hyeZJ9&rFb1ONa)03ZMe zU=05IKCe|?AVBTGYBpli5t!D%(M@Ysw3@Sz15?lwO%e$p4G&NN003%XqeQA9>KQa- z00SmX4Kx@40004?&<22L2B(k&kxf(eLqVVb00TfI)JkDYPgKdKhJXR;Gyni9RZZ%i zrcD4DJtmJ+LqG;kP}9nCA=*Pv^Z6mvLaknM4I;732nU2hGuEbRQG+#1LNy~CiX@Gx zA)c!&#k-E0G?DcZg!;t@0;OH+vyYN1jq z##-^R!!GdQ2(S+tZh9!(OT1TOq}yZF&ID=eJ^6%kEk)^Ls8g;5)VU7DP4kY5%8V>x zP6qh8I%MWZY*NDzl{rx^o#Gnc)FVX-seU}f(yf%}-Ap}0*)XY;HC)92qUcpq7;^0j zn02CIG)uUGvK(2JPQIAenA=$;Gz6%P+kRy#c&K(yB#ih?NrN$06B{{KvhdETl6!U! 
zm}Q+wQxj{AHRyZ#AeYm;&qlN2q&F!;rt2(sk4h7K?&T!7DS7HG3uaAH^kf|BMs;WABfSQO|)hp z@OcnyhZ-hq1s7LZC@Cz1!l@wahBr}(7!@u%MlmXkY0?Qe;J{U=SJ4b{RO+YLqy}LG z@-`YT?gkVOB`ylW!KvBPu!6D}+uWU1D5%QI$5Aj7sE%Yr?2)%P!otC-r3#iinuNR+ zS85oh$CW5U#48XLzSk*AcXmfb4g&6B83{JAO@K5T1zEJs&I(c{ts50n6}V(gnhx;- zGMhyQ7q4PxhX{P+>kbIk5s))PjL>b(!DNa`+O0xLt@QR<+zZ!FmnLPuh#dZa5u=|Z z0!gqzbiJkqTTAHHi%Jjxrvc9WQr-`P*hn5a`R$l`OvqO~kzpS8VH3YP)Q9A}5S9)2 z1H^!CVjVz8Etm_m@f5pY;^VFwR~&PC7t%nq+=;C)(2Lb1Iil_R6Y zjcLorPa-I8 zd8!5OQ1@eREh@0cgWj-XF~+xE%^V(7xLFfars;6_vnhj-EC@idnNTgm4uuQpfIh+r z$J_=4j<<3T8JwP-+7pbtm_>w_RqqJ7PA|p*(Pl$RHF<^f4dZOjqm!cbKo=D=?rdlb zEjmv+b|sqCoOuBr-dmlPYb?W3CK06vj8t0^3~G)oY7emj+%|iF9LA|y`;stR&qFTf zDtDP#1iD!x$T%hK+VTUe5vOjXH4VD9>I^|~1X5H~p94d{frf;;?_Mno*$z)6(f5WO_8*1Oxf>X3vBp>Pi6d zr`%?@8?kakQ`3bqRlh2ytdaJ>J<>+=%RX50u2*d!h>NJx&b!ZE90RQod|oGd<#!N^ z3R|Os+#ac3Jf1!5JovJ&x^ZE>S#Qb|GG3C9ObgjOV#A_40=pcpZ5P1-J?XrPy`|Bv z+fsrRjnPxy)YSlwq}%K}ZFUyZVl|=Ucmt7dmBjgdKVvYw53(K0r8aC@zno-XP{7t! z8&_b`z<0tHP`-8p46Z)HL26C<2HK~I#p^OBO zBT=PLpgSxnmzgXyRJriW27==QDB+N?syM(YPkJztZLY@#m{8^WR>LLf66Wz;CZ?;H z0wBeSyc`Z{os&+HEaok$G`)<&-3&?YEDv?B5@^I$LxU6?sI%2G+4L|`dh3m35K+Ca zinCE2q{ZMHVFV9~92GiaY+}QDDh;qN}Ph*#h*VN410tkDFROQwA)Z*Xz&4< znHn`;Tf_oc)FF&3sMDAxpg>)Bby1F;y^0#w1_q{Kb*CkseGZ`DP8`DHU+Srt*90FD zZWWP!TZ1gT{HNHkiw@bn=gyu0)0)R05{jivq)ejt3@!Xz~c}+}sunG*3 zkxmjW(=Aj}Oe-JQ?cKp1c=An^0jeE#OBaP#++)zMc7YRi>Y8Y965zJW^McH)eZf1l zr)wF~Z4)Rbx=S{l#P0)8aU?}5)p9Bx8&qXMoWu(och!(ub=((m6H!wZ?hxE*_^oJs z_q|vIl*UTYZu~$wtF>2d$o7_38ICiAfYT^Po-x^M5ZtJ&5z=mJWxz_=N?Wp=D@twyJnlXXOyP&DJ?fRe6lu^3jE|0Im&Gs@UxhM!Wz`r zZXzBt!c8>z`Kjl{UKVEKa@E4X-rlPHU}sm8b#A59`k^nkt)D|M4MQ3WQUE4+b@xCbuaE46se$wP7rHhm|Bp}c*r@B{C;zE zvq!A6=LjPUSy8tYQf`KV zJV-|RafLa|8;oONOC!07>TSl&VJ9HHFvELOHY`FxkxuudZh;dq1dR`HS^mMn`-D2r zdhOInR^3`vS7;Y%t3XwKLJX33HB?BM85d&_5{fo37!$-0%SBaHRaNXE%vH)9&cQr- zWx8F~r&ycF)Q#j?MglleWJRd7je9Sy?ag^_yYU~$$c!lBBPkzfu#G6fk9257eF)f# ak-kO9tHm6My%dr^#oUoj6eKCc_~d|WGOM%z literal 0 HcmV?d00001 From 4d69930290294e5dd007803002929a2eceb09b37 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Tue, 29 Aug 2023 15:35:20 -0700 Subject: [PATCH 08/11] Added in missing packages to dev conf file. --- devtools/conda-envs/test_env.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml index 6828d078..4ee2dd7c 100644 --- a/devtools/conda-envs/test_env.yaml +++ b/devtools/conda-envs/test_env.yaml @@ -19,6 +19,8 @@ dependencies: - lightning - tensorboard - torchvision + - openff-units + - pint # Testing - pytest From 327c5fa4a3cf51bc89ada1f794764713570ecb6c Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 30 Aug 2023 17:22:17 -0700 Subject: [PATCH 09/11] Refactored qm9_curation. Moved several function to utils that can be generalized; movement will make testing functions easier. --- modelforge/curation/qm9_curation.py | 99 ++---------------------- modelforge/curation/utils.py | 113 ++++++++++++++++++++++++++++ modelforge/tests/test_curation.py | 72 ++++++++++++------ 3 files changed, 166 insertions(+), 118 deletions(-) diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index 7503ef81..b507fbbd 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -59,15 +59,14 @@ def __init__( self.convert_units = convert_units # Below, we define key pieces of information related to the dataset in the form of a dict. - # `dataset_download_url` and `dataset_filename` are used by the code to fetch the data. 
+ # `dataset_download_url` is only the only variable used by the code to fetch the data. # All other data is metadata that will be used to generate a README to go along with - # the HDF5 dataset. + # the HDF5 dataset and to document the key info within the code. self.dataset_description = { "publication_doi": "10.1038/sdata.2014.22", "figshare_dataset_doi": "10.6084/m9.figshare.c.978904.v5", "figshare_dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2", - "dataset_download_url": "https://ndownloader.figshare.com/files/3195389", - "dataset_filename": "dsgdb9nsd.xyz.tar.bz2", + "dataset_download_url": "https://springernature.figshare.com/ndownloader/files/3195389", "publication_citation": """ Ramakrishnan, R., Dral, P., Rupp, M. et al. Quantum chemistry structures and properties of 134 kilo molecules. @@ -99,63 +98,6 @@ def __init__( "heat capacity at 298.15K": unit.kilojoule_per_mole / unit.kelvin, } - def _mkdir(self, path: str) -> None: - if not os.path.exists(path): - try: - os.makedirs(path) - except Exception: - print("Could not create directory {path}.") - - def _download( - self, url: str, name: str, output_path: str, force_download=False - ) -> None: - """ - Downloads the dataset tar.bz2 file from figshare. - - Parameters - ---------- - url: str, required - Figshare url to the data downloader - name: str, required - Name of the file downloaded - output_path: str, required - Location to download the file to. - force_download: str, default=False - If False, the file is not downloaded if it already exists in the directory. - If True, the file will be downloaded even if it exists. - - """ - - if not os.path.isfile(f"{output_path}/{name}") or force_download: - logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.") - chunk_size = 512 - - # get the head of the request - head = requests.head(url) - - # because the url on figshare calls downloader, instead of the direct file, - # we need to figure out what the original file is to know how big it is - # here we will parse the header info to get the file the downloader links to - # and then get the head info from this link to fetch the length - # this is not actually necessary, but useful for updating download status bar - temp_url = head.headers["location"].split("?")[0] - length = int(requests.head(temp_url).headers["Content-Length"]) - - r = requests.get(url, stream=True) - - self._mkdir(output_path) - - with open(f"{output_path}/{name}", "wb") as fd: - for chunk in tqdm( - r.iter_content(chunk_size=chunk_size), - ascii=True, - desc="downloading", - total=(int(length / chunk_size) + 1), - ): - fd.write(chunk) - else: # if the file exists and we don't set force_download to True, just use the cached version - logger.debug("Datafile exists, using cached file.") - def _extract(self, file_path: str, cache_directory: str) -> None: """ Extract the contents of a tar.bz2 file. @@ -359,33 +301,6 @@ def _parse_xyzfile(self, file_name: str) -> dict: return data - def _list_files(self, directory: str, extension: str) -> list: - """ - Returns a list of files in a directory with a given extension. - - Parameters - ---------- - directory: str, required - Directory of interest. - extension: str, required - Only consider files with this given file extension - - Returns - ------- - list - List of files in the given directory with desired extension. 
-
-        """
-
-        logger.debug(f"Gathering xyz data files in {directory}.")
-
-        files = []
-        for file in os.listdir(directory):
-            if file.endswith(extension):
-                files.append(file)
-        files.sort()
-        return files
-
     def _process_downloaded(
         self, local_path_to_tar: str, name: str, unit_testing: bool
     ):
@@ -396,7 +311,7 @@ def _process_downloaded(
         )
 
         # list the files in the directory to examine
-        files = self._list_files(directory=self.local_cache_dir, extension=".xyz")
+        files = list_files(directory=self.local_cache_dir, extension=".xyz")
 
         # parse the information in each data file, saving to a list of dicts, data
         self.data = []
@@ -409,7 +324,7 @@ def _process_downloaded(
             data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}")
             self.data.append(data_temp)
 
-        self._mkdir(self.output_file_path)
+        mkdir(self.output_file_path)
 
         full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}"
 
@@ -436,13 +351,11 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> None
         >>> qm9_data.process()
 
         """
-        name = self.dataset_description["dataset_filename"]
         url = self.dataset_description["dataset_download_url"]
 
         # download the dataset
-        self._download(
+        name = download_from_figshare(
             url=url,
-            name=name,
             output_path=self.local_cache_dir,
             force_download=force_download,
         )
diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py
index 4c4d216e..d863f59a 100644
--- a/modelforge/curation/utils.py
+++ b/modelforge/curation/utils.py
@@ -3,6 +3,8 @@
 from openff.units import unit, Quantity
 import numpy as np
 from tqdm import tqdm
+import os
+from loguru import logger
 
 
 # define new context for converting energy to energy/mol
@@ -63,3 +65,114 @@ def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None:
             group.create_dataset(name=key, data=val_m, shape=val_m.shape)
             if not val_u is None:
                 group[key].attrs["units"] = val_u
+
+
+def mkdir(path: str) -> bool:
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+            return True
+        except Exception:
+            print(f"Could not create directory {path}.")
+    else:
+        return False
+
+
+def download_from_figshare(url: str, output_path: str, force_download=False) -> str:
+    """
+    Downloads a dataset from figshare.
+
+    Parameters
+    ----------
+    url: str, required
+        Figshare url to the data downloader
+    output_path: str, required
+        Location to download the file to.
+    force_download: bool, default=False
+        If False, the file will only be downloaded if it does not already exist in output_path.
+        If True, the file will be downloaded even if it exists in output_path.
+
+    Returns
+    -------
+    str
+        Name of the file downloaded.
+
+    Examples
+    --------
+    >>> url = 'https://springernature.figshare.com/ndownloader/files/18112775'
+    >>> output_path = '/path/to/directory'
+    >>> downloaded_file_name = download_from_figshare(url, output_path)
+
+    """
+
+    import requests
+
+    chunk_size = 512
+
+    # get the head of the request
+    head = requests.head(url)
+    # Because the url on figshare calls a downloader, instead of the direct file,
+    # we need to figure out where the original file is to know how big it is.
+    # Here we will parse the header info to get the file the downloader links to
+    # and then get the head info from this link to fetch the length.
+    # This is not actually necessary, but useful for updating download status bar.
+    # We also fetch the name of the file from the header of the download link
+    temp_url = head.headers["location"].split("?")[0]
+    name = head.headers["X-Filename"].split("/")[-1]
+
+    logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.")
+
+    if not os.path.isfile(f"{output_path}/{name}") or force_download:
+        length = int(requests.head(temp_url).headers["Content-Length"])
+
+        r = requests.get(url, stream=True)
+
+        mkdir(output_path)
+
+        with open(f"{output_path}/{name}", "wb") as fd:
+            for chunk in tqdm(
+                r.iter_content(chunk_size=chunk_size),
+                ascii=True,
+                desc="downloading",
+                total=(int(length / chunk_size) + 1),
+            ):
+                fd.write(chunk)
+    else:  # if the file exists and we don't set force_download to True, just use the cached version
+        logger.debug(f"Datafile {name} already exists in {output_path}.")
+        logger.debug(
+            "Using already downloaded file; use force_download=True to re-download."
+        )
+
+    return name
+
+
+def list_files(directory: str, extension: str) -> list:
+    """
+    Returns a list of files in a directory with a given extension.
+
+    Parameters
+    ----------
+    directory: str, required
+        Directory of interest.
+    extension: str, required
+        Only consider files with this given file extension
+
+    Returns
+    -------
+    list
+        List of files in the given directory with desired extension.
+
+    Examples
+    --------
+    List only the xyz files in a test_directory
+    >>> files = list_files('test_directory', '.xyz')
+    """
+
+    logger.debug(f"Gathering {extension} files in {directory}.")
+
+    files = []
+    for file in os.listdir(directory):
+        if file.endswith(extension):
+            files.append(file)
+    files.sort()
+    return files
diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py
index ce4bc20b..2a0fdc2b 100644
--- a/modelforge/tests/test_curation.py
+++ b/modelforge/tests/test_curation.py
@@ -18,8 +18,6 @@ def prep_temp_dir(tmp_path_factory):
     fn = tmp_path_factory.mktemp("hdf5_data")
     return fn
 
-    # generate test data into a temporary path
-
 
 def test_dict_to_hdf5(prep_temp_dir):
     # generate an hdf5 file from simple test data
@@ -86,6 +84,42 @@ def test_dict_to_hdf5(prep_temp_dir):
             assert records[i][key] == test_data[i][key]
 
 
+def test_figshare_download(prep_temp_dir):
+    url = "https://figshare.com/ndownloader/files/22247589"
+    name = download_from_figshare(
+        url=url,
+        output_path=str(prep_temp_dir),
+        force_download=False,
+    )
+
+    file_name_path = str(prep_temp_dir) + f"/{name}"
+    assert os.path.isfile(file_name_path)
+
+
+def test_mkdir(prep_temp_dir):
+    # test the mkdir helper function that checks if a directory
+    # exists before creating it
+    # the function returns a status: True if it had to create the directory
+    # and False if it did not have to create the directory
+
+    new_dir = str(prep_temp_dir) + "/new_subdir"
+    # first assert the new directory does not exist
+    assert os.path.exists(new_dir) == False
+
+    # make the new directory and assert it now exists
+    status = mkdir(new_dir)
+    # if status == True, it created the directory because it didn't exist
+    assert status == True
+    assert os.path.exists(new_dir) == True
+
+    # try to make the directory even though it exists
+    # it should return False indicating it already existed
+    # and it did not try to create it
+    status = mkdir(new_dir)
+    assert os.path.exists(new_dir) == True
+    assert status == False
+
+
 def test_qm9_curation_helper_functions(prep_temp_dir):
     qm9_data = QM9_curation(
         hdf5_file_name="qm9_dataset.hdf5",
         output_file_path=str(prep_temp_dir),
         local_cache_dir=str(prep_temp_dir),
     )
 
     val = qm9_data._str_to_float("1*^6")
     assert val == 1e6
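     # (illustrative addition, not part of the original test) _str_to_float
     # only swaps the Mathematica-style "*^" exponent marker for "e" before
     # casting, so a plain decimal string passes through unchanged:
     assert qm9_data._str_to_float("-40.476062") == -40.476062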
# check the function to list directory contents - files = qm9_data._list_files(str(prep_temp_dir), ".hdf5") + # test.hdf5 was generated in test_dict_to_hdf5 + files = list_files(str(prep_temp_dir), ".hdf5") # check to see if test.hdf5 is in the files assert "test.hdf5" in files @@ -263,28 +298,14 @@ def test_qm9_local_archive(prep_temp_dir): assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) -def test_qm9_download(prep_temp_dir): - qm9_data = QM9_curation( - hdf5_file_name="qm9_test10.hdf5", - output_file_path=str(prep_temp_dir), - local_cache_dir=str(prep_temp_dir), - ) - name = qm9_data.dataset_description["dataset_filename"] - url = qm9_data.dataset_description["dataset_download_url"] - - qm9_data._download( - url=url, - name=name, - output_path=str(prep_temp_dir), - force_download=False, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - +""" +# I refactored the code such that the figshare downloader into a separate function +# allowing us to test downloading on a smaller, more manageable file +# Extraction and processing of the qm9 dataset is tested +# based on .tar.bz2 file that exists in tests/data that contains only 10 +# I'm going to leave this code in place, but commented out for now +# as we will eventually want a non-CI testing suite. def test_qm9_curation(prep_temp_dir): - # test file download and extraction # this downloads the entire archive and extracts it # but only processes the first 10 records qm9_data = QM9_curation( @@ -296,7 +317,7 @@ def test_qm9_curation(prep_temp_dir): # test all the functions will run qm9_data.process(unit_testing=True) - name = qm9_data.dataset_description["dataset_filename"] + name = "dsgdb9nsd.xyz.tar.bz2" file_name_path = str(prep_temp_dir) + f"/{name}" assert os.path.isfile(file_name_path) @@ -325,3 +346,4 @@ def test_qm9_curation(prep_temp_dir): # check record names assert key in list(names.keys()) assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) +""" From 0832992f7480c8c1f5a796c647df9a3e9c667ac6 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Wed, 30 Aug 2023 22:34:07 -0700 Subject: [PATCH 10/11] Minimal mockup of automatic metadata generation for zenodo description. --- modelforge/curation/ani_curation.py | 0 modelforge/curation/qm9_curation.py | 63 ++++++++++++++++++++--------- modelforge/tests/test_curation.py | 2 +- 3 files changed, 45 insertions(+), 20 deletions(-) create mode 100644 modelforge/curation/ani_curation.py diff --git a/modelforge/curation/ani_curation.py b/modelforge/curation/ani_curation.py new file mode 100644 index 00000000..e69de29b diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index b507fbbd..e5835334 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -57,7 +57,9 @@ def __init__( self.output_file_path = output_file_path self.hdf5_file_name = hdf5_file_name self.convert_units = convert_units - + self.dataset_download_url = ( + "https://springernature.figshare.com/ndownloader/files/3195389" + ) # Below, we define key pieces of information related to the dataset in the form of a dict. # `dataset_download_url` is only the only variable used by the code to fetch the data. 
# All other data is metadata that will be used to generate a README to go along with @@ -67,21 +69,9 @@ def __init__( "figshare_dataset_doi": "10.6084/m9.figshare.c.978904.v5", "figshare_dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2", "dataset_download_url": "https://springernature.figshare.com/ndownloader/files/3195389", - "publication_citation": """ - Ramakrishnan, R., Dral, P., Rupp, M. et al. - Quantum chemistry structures and properties of 134 kilo molecules. - Sci Data 1, 140022 (2014). - https://doi.org/10.1038/sdata.2014.22 - """, - "dataset_citation": """ - Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014). - Quantum chemistry structures and properties of 134 kilo molecules. - figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5 - """, - "description": """ - QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF). - ll properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry. - """, + "publication_citation": "Ramakrishnan, R., Dral, P., Rupp, M. et al. Quantum chemistry structures and properties of 134 kilo molecules. Sci Data 1, 140022 (2014).", + "dataset_citation": "Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014). Quantum chemistry structures and properties of 134 kilo molecules. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5", + "description": "QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF). All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.", } # if convert_units is True we will # convert the following units @@ -351,13 +341,48 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N >>> qm9_data.process() """ - url = self.dataset_description["dataset_download_url"] + url = self.dataset_download_url # download the dataset - name = download_from_figshare( + self.name = download_from_figshare( url=url, output_path=self.local_cache_dir, force_download=force_download, ) # process the rest of the dataset - self._process_downloaded(self.local_cache_dir, name, unit_testing) + if self.name is None: + raise Exception("Failed to retrieve name of file from figshare.") + self._process_downloaded(self.local_cache_dir, self.name, unit_testing) + + def _generate_metadata(self): + with open( + f"{self.output_file_path}/{self.hdf5_file_name}.metadata", "w" + ) as f_md: + f_md.write("Dataset Description:\n") + f_md.write(self.dataset_description["description"]) + f_md.write("\n\nPublication Citation:\n") + f_md.write(self.dataset_description["publication_citation"]) + f_md.write("\n\nPublication DOI:\n") + f_md.write(self.dataset_description["publication_doi"]) + f_md.write("\n\nSource dataset DOI:\n") + f_md.write(self.dataset_description["figshare_dataset_url"]) + f_md.write("\n\nSource dataset download URL:\n") + f_md.write(self.dataset_description["dataset_download_url"]) + + f_md.write("\n\nHDF5 dataset curated by modelforge:\n") + f_md.write( + "The top level of the HDF5 file contains entries for each record name.\n" + ) + f_md.write( + "Each record contains the following data, where units, when appropriate, are stored as attributes.\n" + ) + f_md.write("Unit naming conventions follow the openff-units package.\n\n") + f_md.write("property : type : units\n") + for key, val in self.data[0].items(): + if isinstance(val, pint.Quantity): + var_type = 
str(type(val.m).__name__) + f_md.write(f"{key} : {var_type} : {val.u}\n") + else: + var_type = str(type(val).__name__) + + f_md.write(f"{key} : {var_type} : N/A\n") diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py index 2a0fdc2b..95a3c79a 100644 --- a/modelforge/tests/test_curation.py +++ b/modelforge/tests/test_curation.py @@ -270,7 +270,7 @@ def test_qm9_local_archive(prep_temp_dir): fn = resources.files("modelforge").joinpath("tests", "data") - qm9_data._process_downloaded(str(fn), "first10.tar.bz2", unit_testing=True) + qm9_data._process_downloaded(str(fn), "first10.tar.bz2", unit_testing=False) assert len(qm9_data.data) == 10 From 41c2c7e11a523be64b02f855f09d40cabe09b848 Mon Sep 17 00:00:00 2001 From: chrisiacovella Date: Thu, 31 Aug 2023 15:05:36 -0700 Subject: [PATCH 11/11] Updated tests. Changing unit_testing argument to instead define the number of records to output as part of testing. Useful for creating several smaller curated datasets. --- modelforge/curation/qm9_curation.py | 102 ++++++++++------ modelforge/curation/utils.py | 16 ++- modelforge/tests/test_curation.py | 180 ++++++++++++---------------- 3 files changed, 154 insertions(+), 144 deletions(-) diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index e5835334..2cea83d0 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -77,15 +77,15 @@ def __init__( # convert the following units self.unit_output_dict = { "geometry": unit.nanometer, - "energy of homo": unit.kilojoule_per_mole, - "energy of lumo": unit.kilojoule_per_mole, - "gap": unit.kilojoule_per_mole, - "zero point vibrational energy": unit.kilojoule_per_mole, - "internal energy at 0K": unit.kilojoule_per_mole, - "internal energy at 298.15K": unit.kilojoule_per_mole, - "enthalpy at 298.15K": unit.kilojoule_per_mole, - "free energy at 298.15K": unit.kilojoule_per_mole, - "heat capacity at 298.15K": unit.kilojoule_per_mole / unit.kelvin, + "energy_of_homo": unit.kilojoule_per_mole, + "energy_of_lumo": unit.kilojoule_per_mole, + "lumo-homo_gap": unit.kilojoule_per_mole, + "zero_point_vibrational_energy": unit.kilojoule_per_mole, + "internal_energy_at_0K": unit.kilojoule_per_mole, + "internal_energy_at_298.15K": unit.kilojoule_per_mole, + "enthalpy_at_298.15K": unit.kilojoule_per_mole, + "free_energy_at_298.15K": unit.kilojoule_per_mole, + "heat_capacity_at_298.15K": unit.kilojoule_per_mole / unit.kelvin, } def _extract(self, file_path: str, cache_directory: str) -> None: @@ -146,21 +146,21 @@ def _parse_properties(self, line: str) -> dict: labels_and_units = [ ("tag", None), ("idx", None), - ("rotational constant A", unit.gigahertz), - ("rotational constant B", unit.gigahertz), - ("rotational constant C", unit.gigahertz), - ("dipole moment", unit.debye), - ("isotropic polarizability", unit.angstrom**3), - ("energy of homo", unit.hartree), - ("energy of lumo", unit.hartree), - ("gap", unit.hartree), - ("electronic spatial extent", unit.angstrom**2), - ("zero point vibrational energy", unit.hartree), - ("internal energy at 0K", unit.hartree), - ("internal energy at 298.15K", unit.hartree), - ("enthalpy at 298.15K", unit.hartree), - ("free energy at 298.15K", unit.hartree), - ("heat capacity at 298.15K", unit.calorie_per_mole / unit.kelvin), + ("rotational_constant_A", unit.gigahertz), + ("rotational_constant_B", unit.gigahertz), + ("rotational_constant_C", unit.gigahertz), + ("dipole_moment", unit.debye), + ("isotropic_polarizability", unit.angstrom**3), + 
("energy_of_homo", unit.hartree), + ("energy_of_lumo", unit.hartree), + ("lumo-homo_gap", unit.hartree), + ("electronic_spatial_extent", unit.angstrom**2), + ("zero_point_vibrational_energy", unit.hartree), + ("internal_energy_at_0K", unit.hartree), + ("internal_energy_at_298.15K", unit.hartree), + ("enthalpy_at_298.15K", unit.hartree), + ("free_energy_at_298.15K", unit.hartree), + ("heat_capacity_at_298.15K", unit.calorie_per_mole / unit.kelvin), ] assert len(labels_and_units) == len(temp_prop) @@ -254,16 +254,16 @@ def _parse_xyzfile(self, file_name: str) -> dict: data = {} data["name"] = file_name.split("/")[-1].split(".")[0] - data["smiles gdb-17"] = smiles[0] - data["smiles b3lyp"] = smiles[1] - data["inchi Corina"] = InChI.split("\n")[0].split()[0].replace("InChI=", "") - data["inchi B3LYP"] = InChI.split("\n")[0].split()[1].replace("InChI=", "") + data["smiles_gdb-17"] = smiles[0] + data["smiles_b3lyp"] = smiles[1] + data["inchi_Corina"] = InChI.split("\n")[0].split()[0].replace("InChI=", "") + data["inchi_B3LYP"] = InChI.split("\n")[0].split()[1].replace("InChI=", "") data["geometry"] = np.array(geometry) * unit.angstrom # Element symbols are converted to atomic numbers # including an array of strings causes complications # when writing the hdf5 file. # data["elements"] = np.array(elements, dtype=str) - data["atomic numbers"] = np.array(atomic_numbers) + data["atomic_numbers"] = np.array(atomic_numbers) data["charges"] = np.array(charges) * unit.elementary_charge # remove the tag because it does not provide any useful information @@ -276,7 +276,7 @@ def _parse_xyzfile(self, file_name: str) -> dict: for h in hvf_temp: hvf.append(self._str_to_float(h)) - data["harmonic vibrational frequencies"] = np.array(hvf) / unit.cm + data["harmonic_vibrational_frequencies"] = np.array(hvf) / unit.cm # if unit outputs were defined perform conversion if self.convert_units: @@ -292,8 +292,29 @@ def _parse_xyzfile(self, file_name: str) -> dict: return data def _process_downloaded( - self, local_path_to_tar: str, name: str, unit_testing: bool + self, + local_path_to_tar: str, + name: str, + unit_testing_max_records: Optional[int] = None, ): + """ + Processes a downloaded dataset: extracts relevant information and writes an hdf5 file. + + Parameters + ---------- + local_path_to_tar: str, required + Path to the tar.bz2 file. + name: str, required + name of the tar.bz2 file, + unit_testing_max_records: int, optional, default=None + If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests. 
+ + Examples + -------- + >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset') + >>> qm9_data.process() + + """ # untar the dataset self._extract( file_path=f"{local_path_to_tar}/{name}", @@ -307,8 +328,8 @@ def _process_downloaded( self.data = [] for i, file in enumerate(tqdm(files, desc="processing", total=len(files))): # first 10 records - if unit_testing: - if i > 9: + if not unit_testing_max_records is None: + if i >= unit_testing_max_records: break data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}") @@ -322,7 +343,11 @@ def _process_downloaded( logger.debug("Writing HDF5 file.") dict_to_hdf5(full_output_path, self.data, id_key="name") - def process(self, force_download: bool = False, unit_testing: bool = False) -> None: + def process( + self, + force_download: bool = False, + unit_testing_max_records: Optional[int] = None, + ) -> None: """ Downloads the dataset, extracts relevant information, and writes an hdf5 file. @@ -331,9 +356,8 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N force_download: bool, optional, default=False If the raw data_file is present in the local_cache_dir, the local copy will be used. If True, this will force the software to download the data again, even if present. - unit_testing: bool, optional, default=False - If True, only a subset (first 10 records) of the dataset will be used. - Primarily meant to ensure unit tests can be completed in a reasonable time period. + unit_testing_max_records: int, optional, default=None + If set to an integer, 'n', the routine will only process the first 'n' records, useful for unit tests. Examples -------- @@ -352,7 +376,9 @@ def process(self, force_download: bool = False, unit_testing: bool = False) -> N # process the rest of the dataset if self.name is None: raise Exception("Failed to retrieve name of file from figshare.") - self._process_downloaded(self.local_cache_dir, self.name, unit_testing) + self._process_downloaded( + self.local_cache_dir, self.name, unit_testing_max_records + ) def _generate_metadata(self): with open( diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py index d863f59a..31a2cefa 100644 --- a/modelforge/curation/utils.py +++ b/modelforge/curation/utils.py @@ -1,8 +1,5 @@ -import h5py import pint from openff.units import unit, Quantity -import numpy as np -from tqdm import tqdm import os from loguru import logger @@ -39,15 +36,23 @@ def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None: Examples -------- - > dict_to_hdf5(file_name='qm9.hdf5', data=data, id_key='name') + >>> dict_to_hdf5(file_name='qm9.hdf5', data=data, id_key='name') """ + + import h5py + from tqdm import tqdm + import numpy as np + assert file_name.endswith(".hdf5") dt = h5py.special_dtype(vlen=str) with h5py.File(file_name, "w") as f: for datapoint in tqdm(data): - record_name = datapoint[id_key] + try: + record_name = datapoint[id_key] + except Exception: + print(f"id_key {id_key} not found in the data.") group = f.create_group(record_name) for key, val in datapoint.items(): if key != id_key: @@ -106,6 +111,7 @@ def download_from_figshare(url: str, output_path: str, force_download=False) -> """ import requests + from tqdm import tqdm chunk_size = 512 diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py index 95a3c79a..309aab39 100644 --- a/modelforge/tests/test_curation.py +++ b/modelforge/tests/test_curation.py @@ -3,6 +3,8 @@ from pathlib import Path import numpy as 
np +import h5py + import pytest from openff.units import unit, Quantity import pint @@ -84,12 +86,12 @@ def test_dict_to_hdf5(prep_temp_dir): assert records[i][key] == test_data[i][key] -def test_figshare_download(prep_temp_dir): +def test_download_from_figshare(prep_temp_dir): url = "https://figshare.com/ndownloader/files/22247589" name = download_from_figshare( url=url, output_path=str(prep_temp_dir), - force_download=False, + force_download=True, ) file_name_path = str(prep_temp_dir) + f"/{name}" @@ -120,7 +122,15 @@ def test_mkdir(prep_temp_dir): assert status == False -def test_qm9_curation_helper_functions(prep_temp_dir): +def test_list_files(prep_temp_dir): + # test.hdf5 was generated in test_dict_to_hdf5 + files = list_files(str(prep_temp_dir), ".hdf5") + + # check to see if test.hdf5 is in the files + assert "test.hdf5" in files + + +def test_qm9_curation_str_to_float(prep_temp_dir): qm9_data = QM9_curation( hdf5_file_name="qm9_dataset.hdf5", output_file_path=str(prep_temp_dir), @@ -130,12 +140,19 @@ def test_qm9_curation_helper_functions(prep_temp_dir): val = qm9_data._str_to_float("1*^6") assert val == 1e6 - # check the function to list directory contents - # test.hdf5 was generated in test_dict_to_hdf5 - files = list_files(str(prep_temp_dir), ".hdf5") - # check to see if test.hdf5 is in the files - assert "test.hdf5" in files +def test_qm9_curation_init_parameters(prep_temp_dir): + qm9_data = QM9_curation( + hdf5_file_name="qm9_dataset.hdf5", + output_file_path=str(prep_temp_dir), + local_cache_dir=str(prep_temp_dir), + convert_units=False, + ) + + assert qm9_data.hdf5_file_name == "qm9_dataset.hdf5" + assert qm9_data.output_file_path == str(prep_temp_dir) + assert qm9_data.local_cache_dir == str(prep_temp_dir) + assert qm9_data.convert_units == False def test_qm9_curation_parse_xyz(prep_temp_dir): @@ -153,25 +170,26 @@ def test_qm9_curation_parse_xyz(prep_temp_dir): assert len(temp_dict) == 17 assert temp_dict["tag"] == "gdb" assert temp_dict["idx"] == "1" - assert temp_dict["rotational constant A"] == 157.7 * unit.gigahertz - assert temp_dict["rotational constant B"] == 157.7 * unit.gigahertz - assert temp_dict["rotational constant C"] == 157.7 * unit.gigahertz - assert temp_dict["dipole moment"] == 0 * unit.debye - assert temp_dict["isotropic polarizability"] == 13.2 * unit.angstrom**3 - assert temp_dict["energy of homo"] == -0.3 * unit.hartree - assert temp_dict["energy of lumo"] == 0.1 * unit.hartree - assert temp_dict["gap"] == 0.5 * unit.hartree - assert temp_dict["electronic spatial extent"] == 35.3 * unit.angstrom**2 - assert temp_dict["zero point vibrational energy"] == 0.0 * unit.hartree - assert temp_dict["internal energy at 0K"] == -40.4 * unit.hartree - assert temp_dict["internal energy at 298.15K"] == -40.4 * unit.hartree - assert temp_dict["enthalpy at 298.15K"] == -40.4 * unit.hartree - assert temp_dict["free energy at 298.15K"] == -40.4 * unit.hartree + assert temp_dict["rotational_constant_A"] == 157.7 * unit.gigahertz + assert temp_dict["rotational_constant_B"] == 157.7 * unit.gigahertz + assert temp_dict["rotational_constant_C"] == 157.7 * unit.gigahertz + assert temp_dict["dipole_moment"] == 0 * unit.debye + assert temp_dict["isotropic_polarizability"] == 13.2 * unit.angstrom**3 + assert temp_dict["energy_of_homo"] == -0.3 * unit.hartree + assert temp_dict["energy_of_lumo"] == 0.1 * unit.hartree + assert temp_dict["lumo-homo_gap"] == 0.5 * unit.hartree + assert temp_dict["electronic_spatial_extent"] == 35.3 * unit.angstrom**2 + assert 
temp_dict["zero_point_vibrational_energy"] == 0.0 * unit.hartree + assert temp_dict["internal_energy_at_0K"] == -40.4 * unit.hartree + assert temp_dict["internal_energy_at_298.15K"] == -40.4 * unit.hartree + assert temp_dict["enthalpy_at_298.15K"] == -40.4 * unit.hartree + assert temp_dict["free_energy_at_298.15K"] == -40.4 * unit.hartree assert ( - temp_dict["heat capacity at 298.15K"] + temp_dict["heat_capacity_at_298.15K"] == 6.4 * unit.calorie_per_mole / unit.kelvin ) + # test parsing an entire file from our data directory with unit conversions fn = resources.files("modelforge").joinpath("tests", "data", "dsgdb9nsd_000001.xyz") data_dict_temp = qm9_data._parse_xyzfile(str(fn)) @@ -197,51 +215,53 @@ def test_qm9_curation_parse_xyz(prep_temp_dir): == np.array([-0.535689, 0.133921, 0.133922, 0.133923, 0.133923]) * unit.elementary_charge ) - assert data_dict_temp["isotropic polarizability"] == 13.21 * unit.angstroms**3 + assert data_dict_temp["isotropic_polarizability"] == 13.21 * unit.angstroms**3 assert ( - data_dict_temp["energy of homo"] + data_dict_temp["energy_of_homo"] == -1017.9062102263447 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["energy of lumo"] == 307.4460077830925 * unit.kilojoule_per_mole + data_dict_temp["energy_of_lumo"] == 307.4460077830925 * unit.kilojoule_per_mole ) - assert data_dict_temp["gap"] == 1325.3522180094374 * unit.kilojoule_per_mole - assert data_dict_temp["electronic spatial extent"] == 35.3641 * unit.angstrom**2 assert ( - data_dict_temp["zero point vibrational energy"] + data_dict_temp["lumo-homo_gap"] == 1325.3522180094374 * unit.kilojoule_per_mole + ) + assert data_dict_temp["electronic_spatial_extent"] == 35.3641 * unit.angstrom**2 + assert ( + data_dict_temp["zero_point_vibrational_energy"] == 117.4884833670846 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["internal energy at 0K"] + data_dict_temp["internal_energy_at_0K"] == -106277.4161215308 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["internal energy at 298.15K"] + data_dict_temp["internal_energy_at_298.15K"] == -106269.88618856476 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["enthalpy at 298.15K"] + data_dict_temp["enthalpy_at_298.15K"] == -106267.40509140545 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["free energy at 298.15K"] + data_dict_temp["free_energy_at_298.15K"] == -106329.05182294044 * unit.kilojoule_per_mole ) assert ( - data_dict_temp["heat capacity at 298.15K"] + data_dict_temp["heat_capacity_at_298.15K"] == 0.027066296000000004 * unit.kilojoule_per_mole / unit.kelvin ) - assert np.all(data_dict_temp["atomic numbers"] == np.array([6, 1, 1, 1, 1])) - assert data_dict_temp["smiles gdb-17"] == "C" - assert data_dict_temp["smiles b3lyp"] == "C" - assert data_dict_temp["inchi Corina"] == "1S/CH4/h1H4" - assert data_dict_temp["inchi B3LYP"] == "1S/CH4/h1H4" - assert data_dict_temp["rotational constant A"] == 157.7118 * unit.gigahertz - assert data_dict_temp["rotational constant B"] == 157.70997 * unit.gigahertz - assert data_dict_temp["rotational constant C"] == 157.70699 * unit.gigahertz - assert data_dict_temp["dipole moment"] == 0.0 * unit.debye + assert np.all(data_dict_temp["atomic_numbers"] == np.array([6, 1, 1, 1, 1])) + assert data_dict_temp["smiles_gdb-17"] == "C" + assert data_dict_temp["smiles_b3lyp"] == "C" + assert data_dict_temp["inchi_Corina"] == "1S/CH4/h1H4" + assert data_dict_temp["inchi_B3LYP"] == "1S/CH4/h1H4" + assert data_dict_temp["rotational_constant_A"] == 157.7118 * unit.gigahertz + assert 
data_dict_temp["rotational_constant_B"] == 157.70997 * unit.gigahertz + assert data_dict_temp["rotational_constant_C"] == 157.70699 * unit.gigahertz + assert data_dict_temp["dipole_moment"] == 0.0 * unit.debye assert np.all( - data_dict_temp["harmonic vibrational frequencies"] + data_dict_temp["harmonic_vibrational_frequencies"] == np.array( [ 1341.307, @@ -260,23 +280,23 @@ def test_qm9_curation_parse_xyz(prep_temp_dir): def test_qm9_local_archive(prep_temp_dir): - # test file extraction, parsing, and generation of hdf5 file - # from a local archive. + # test file extraction, parsing, and generation of hdf5 file from a local archive. qm9_data = QM9_curation( hdf5_file_name="qm9_test10.hdf5", output_file_path=str(prep_temp_dir), local_cache_dir=str(prep_temp_dir), ) - fn = resources.files("modelforge").joinpath("tests", "data") + local_data_path = resources.files("modelforge").joinpath("tests", "data") + # make sure the data archive exists + file_name_path = str(local_data_path) + "/first10.tar.bz2" + assert os.path.isfile(file_name_path) - qm9_data._process_downloaded(str(fn), "first10.tar.bz2", unit_testing=False) + # pass the local file to the process_downloaded function + qm9_data._process_downloaded(str(local_data_path), "first10.tar.bz2") assert len(qm9_data.data) == 10 - file_name_path = str(fn) + "/first10.tar.bz2" - assert os.path.isfile(file_name_path) - names = { "dsgdb9nsd_000001": -106277.4161215308, "dsgdb9nsd_000002": -148408.69593977975, @@ -289,61 +309,19 @@ def test_qm9_local_archive(prep_temp_dir): "dsgdb9nsd_000009": -306158.32885940996, "dsgdb9nsd_000010": -348451.454977435, } + # output file file_name_path = str(prep_temp_dir) + "/qm9_test10.hdf5" - with h5py.File(file_name_path, "r") as hf: - for key in hf.keys(): - # check record names - assert key in list(names.keys()) - assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) - - -""" -# I refactored the code such that the figshare downloader into a separate function -# allowing us to test downloading on a smaller, more manageable file -# Extraction and processing of the qm9 dataset is tested -# based on .tar.bz2 file that exists in tests/data that contains only 10 -# I'm going to leave this code in place, but commented out for now -# as we will eventually want a non-CI testing suite. 
-def test_qm9_curation(prep_temp_dir): - # this downloads the entire archive and extracts it - # but only processes the first 10 records - qm9_data = QM9_curation( - hdf5_file_name="qm9_dataset.hdf5", - output_file_path=str(prep_temp_dir), - local_cache_dir=str(prep_temp_dir), - ) - - # test all the functions will run - qm9_data.process(unit_testing=True) - - name = "dsgdb9nsd.xyz.tar.bz2" - - file_name_path = str(prep_temp_dir) + f"/{name}" assert os.path.isfile(file_name_path) - # ensure we processed 10 records - assert len(qm9_data.data) == 10 - - file_name_path = str(prep_temp_dir) + "/qm9_dataset.hdf5" - assert os.path.isfile(file_name_path) - - names = { - "dsgdb9nsd_000001": -106277.4161215308, - "dsgdb9nsd_000002": -148408.69593977975, - "dsgdb9nsd_000003": -200600.51755556674, - "dsgdb9nsd_000004": -202973.24721725564, - "dsgdb9nsd_000005": -245252.87826713378, - "dsgdb9nsd_000006": -300576.6846578527, - "dsgdb9nsd_000007": -209420.75231941737, - "dsgdb9nsd_000008": -303715.5298633426, - "dsgdb9nsd_000009": -306158.32885940996, - "dsgdb9nsd_000010": -348451.454977435, - } - with h5py.File(file_name_path, "r") as hf: for key in hf.keys(): # check record names assert key in list(names.keys()) - assert np.isclose(hf[key]["internal energy at 0K"][()], names[key]) -""" + assert np.isclose(hf[key]["internal_energy_at_0K"][()], names[key]) + + qm9_data._process_downloaded( + str(local_data_path), "first10.tar.bz2", unit_testing_max_records=5 + ) + + assert len(qm9_data.data) == 5
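
Usage sketch (illustrative; not part of the patch series): with the API as of
the final patch, curating QM9 into an HDF5 file reduces to constructing
QM9_curation and calling process(). The paths below are placeholders.

    from modelforge.curation.qm9_curation import QM9_curation

    qm9_data = QM9_curation(
        hdf5_file_name="qm9_dataset.hdf5",
        output_file_path="/path/to/output",  # where the curated .hdf5 is written
        local_cache_dir="/path/to/cache",  # raw tar.bz2 and extracted .xyz files
        convert_units=True,  # e.g., hartree -> kJ/mol per unit_output_dict
    )
    # download from figshare (cached unless force_download=True), extract the
    # tar.bz2, parse each .xyz record, and write the hdf5 file; passing
    # unit_testing_max_records=n processes only the first n records
    qm9_data.process(force_download=False, unit_testing_max_records=10)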