diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 8527b1db..bef01088 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -19,6 +19,8 @@ dependencies:
   - lightning
   - tensorboard
   - torchvision
+  - openff-units
+  - pint
   - ase
 
 # Testing
diff --git a/modelforge/curation/__init__.py b/modelforge/curation/__init__.py
new file mode 100644
index 00000000..0f89000c
--- /dev/null
+++ b/modelforge/curation/__init__.py
@@ -0,0 +1 @@
+"""modelforge dataset curation."""
diff --git a/modelforge/curation/ani_curation.py b/modelforge/curation/ani_curation.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py
new file mode 100644
index 00000000..2cea83d0
--- /dev/null
+++ b/modelforge/curation/qm9_curation.py
@@ -0,0 +1,414 @@
+import requests
+from loguru import logger
+import os
+from tqdm import tqdm
+
+from typing import Optional
+from openff.units import unit, Quantity
+import pint
+import qcelemental as qcel
+
+import tarfile
+from modelforge.curation.utils import *
+import numpy as np
+
+
+class QM9_curation:
+    """
+    Routines to fetch and process the QM9 dataset into a curated hdf5 file.
+
+    The QM9 dataset includes 133,885 organic molecules with up to nine heavy atoms (CONF).
+    All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.
+
+    Citation: Ramakrishnan, R., Dral, P., Rupp, M. et al.
+        Quantum chemistry structures and properties of 134 kilo molecules.
+        Sci Data 1, 140022 (2014).
+        https://doi.org/10.1038/sdata.2014.22
+
+    DOI for dataset: 10.6084/m9.figshare.c.978904.v5
+
+    Parameters
+    ----------
+    hdf5_file_name: str, required
+        Name of the hdf5 file that will be generated.
+    output_file_path: str, optional, default='./'
+        Path to write the output hdf5 file.
+    local_cache_dir: str, optional, default='./qm9_datafiles'
+        Location to save the downloaded dataset.
+    convert_units: bool, optional, default=True
+        Convert from the source units [angstrom, hartree]
+        to [nanometer, kJ/mol].
+
+    Examples
+    --------
+    >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+    >>> qm9_data.process()
+
+    """
+
+    def __init__(
+        self,
+        hdf5_file_name: str,
+        output_file_path: Optional[str] = "./",
+        local_cache_dir: Optional[str] = "./qm9_datafiles",
+        convert_units: Optional[bool] = True,
+    ):
+        self.local_cache_dir = local_cache_dir
+        self.output_file_path = output_file_path
+        self.hdf5_file_name = hdf5_file_name
+        self.convert_units = convert_units
+        self.dataset_download_url = (
+            "https://springernature.figshare.com/ndownloader/files/3195389"
+        )
+        # Below, we define key pieces of information related to the dataset in the form of a dict.
+        # `dataset_download_url` is the only variable used by the code to fetch the data.
+        # All other fields are metadata that will be used to generate a README to go along with
+        # the HDF5 dataset and to document the key info within the code.
+        self.dataset_description = {
+            "publication_doi": "10.1038/sdata.2014.22",
+            "figshare_dataset_doi": "10.6084/m9.figshare.c.978904.v5",
+            "figshare_dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2",
+            "dataset_download_url": "https://springernature.figshare.com/ndownloader/files/3195389",
+            "publication_citation": "Ramakrishnan, R., Dral, P., Rupp, M. et al. Quantum chemistry structures and properties of 134 kilo molecules. Sci Data 1, 140022 (2014).",
+            "dataset_citation": "Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014). Quantum chemistry structures and properties of 134 kilo molecules. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5",
+            "description": "QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF). All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.",
+        }
+        # if convert_units is True, the following properties will be
+        # converted to the units defined here
+        self.unit_output_dict = {
+            "geometry": unit.nanometer,
+            "energy_of_homo": unit.kilojoule_per_mole,
+            "energy_of_lumo": unit.kilojoule_per_mole,
+            "lumo-homo_gap": unit.kilojoule_per_mole,
+            "zero_point_vibrational_energy": unit.kilojoule_per_mole,
+            "internal_energy_at_0K": unit.kilojoule_per_mole,
+            "internal_energy_at_298.15K": unit.kilojoule_per_mole,
+            "enthalpy_at_298.15K": unit.kilojoule_per_mole,
+            "free_energy_at_298.15K": unit.kilojoule_per_mole,
+            "heat_capacity_at_298.15K": unit.kilojoule_per_mole / unit.kelvin,
+        }
+
+    def _extract(self, file_path: str, cache_directory: str) -> None:
+        """
+        Extract the contents of a tar.bz2 file.
+
+        Parameters
+        ----------
+        file_path: str, required
+            Path of the tar.bz2 file to extract.
+        cache_directory: str, required
+            Location to save the contents of the tar.bz2 file.
+        """
+        logger.debug(f"Extracting tar {file_path}.")
+
+        with tarfile.open(f"{file_path}", "r:bz2") as tar:
+            tar.extractall(cache_directory)
+
+    def _str_to_float(self, x: str) -> float:
+        """
+        Converts a string to a float, changing Mathematica-style scientific notation to Python style.
+
+        For example, this will convert str(1*^-6) to float(1e-6).
+
+        Parameters
+        ----------
+        x : str, required
+            String to process.
+
+        Returns
+        -------
+        float
+            Float value of the string.
+        """
+        xf = float(x.replace("*^", "e"))
+        return xf
+
+    def _parse_properties(self, line: str) -> dict:
+        """
+        Parses the line in the xyz file that contains property information.
+
+        Parameters
+        ----------
+        line: str, required
+            String to parse that contains property information. The structure of this line
+            follows the description in the original manuscript (see tables 2 and 3).
+
+        Returns
+        -------
+        dict
+            Dictionary of properties, with units added when appropriate.
+        """
+
+        temp_prop = line.split()
+        # list of properties and their units in the order they appear in the file.
+
+        labels_and_units = [
+            ("tag", None),
+            ("idx", None),
+            ("rotational_constant_A", unit.gigahertz),
+            ("rotational_constant_B", unit.gigahertz),
+            ("rotational_constant_C", unit.gigahertz),
+            ("dipole_moment", unit.debye),
+            ("isotropic_polarizability", unit.angstrom**3),
+            ("energy_of_homo", unit.hartree),
+            ("energy_of_lumo", unit.hartree),
+            ("lumo-homo_gap", unit.hartree),
+            ("electronic_spatial_extent", unit.angstrom**2),
+            ("zero_point_vibrational_energy", unit.hartree),
+            ("internal_energy_at_0K", unit.hartree),
+            ("internal_energy_at_298.15K", unit.hartree),
+            ("enthalpy_at_298.15K", unit.hartree),
+            ("free_energy_at_298.15K", unit.hartree),
+            ("heat_capacity_at_298.15K", unit.calorie_per_mole / unit.kelvin),
+        ]
+
+        assert len(labels_and_units) == len(temp_prop)
+
+        data = {}
+        for prop, label_and_unit in zip(temp_prop, labels_and_units):
+            label, prop_unit = label_and_unit
+            if prop_unit is None:
+                data[label] = prop
+            else:
+                data[label] = self._str_to_float(prop) * prop_unit
+        return data
+
+    def _parse_xyzfile(self, file_name: str) -> dict:
+        """
+        Parses the file containing information for each molecule.
+
+        The structure of the file follows tables 2 and 3 of the original manuscript
+        (see the file format notes below).
+
+        Parameters
+        ----------
+        file_name: str, required
+            Name of the file to parse.
+
+        Returns
+        -------
+        dict
+            Dict of parsed properties.
+
+        File format info
+        ----------------
+
+        Line         Content
+        1            Number of atoms, n_a
+        2            Scalar properties (see below)
+        3,...,n_a+2  Element type, coords (x, y, z in Ang), Mulliken partial charges (in e)
+        n_a+3        Harmonic vibrational frequencies (3n_a-5 or 3n_a-6, in cm^-1)
+        n_a+4        SMILES strings from GDB-17 and from B3LYP relaxation
+        n_a+5        InChI strings for Corina and B3LYP geometries
+
+        Scalar properties:
+
+        #   Unit        Description
+        1   N/A         gdb9 string to facilitate extraction
+        2   N/A         Consecutive, 1-based integer identifier
+        3   GHz         Rotational constant A
+        4   GHz         Rotational constant B
+        5   GHz         Rotational constant C
+        6   D           Dipole moment
+        7   Ang^3       Isotropic polarizability
+        8   Ha          Energy of HOMO
+        9   Ha          Energy of LUMO
+        10  Ha          LUMO-HOMO gap
+        11  Ang^2       Electronic spatial extent
+        12  Ha          Zero point vibrational energy
+        13  Ha          Internal energy at 0K
+        14  Ha          Internal energy at 298.15K
+        15  Ha          Enthalpy at 298.15K
+        16  Ha          Free energy at 298.15K
+        17  cal/mol/K   Heat capacity at 298.15K
+
+        """
+
+        with open(file_name, "r") as file:
+            n_atoms = int(file.readline())
+            properties_temp = file.readline()
+            properties = self._parse_properties(properties_temp)
+            elements = []
+            atomic_numbers = []
+            geometry = []
+            charges = []
+            hvf = []
+            for i in range(n_atoms):
+                line = file.readline()
+                element, x, y, z, q = line.split()
+                elements.append(element)
+                atomic_numbers.append(qcel.periodictable.to_atomic_number(element))
+                temp = [
+                    self._str_to_float(x),
+                    self._str_to_float(y),
+                    self._str_to_float(z),
+                ]
+                geometry.append(temp)
+                charges.append(self._str_to_float(q))
+
+            hvf_temp = file.readline().split()
+
+            smiles = file.readline().split()
+            InChI = file.readline()
+
+        data = {}
+        data["name"] = file_name.split("/")[-1].split(".")[0]
+        data["smiles_gdb-17"] = smiles[0]
+        data["smiles_b3lyp"] = smiles[1]
+        data["inchi_Corina"] = InChI.split("\n")[0].split()[0].replace("InChI=", "")
+        data["inchi_B3LYP"] = InChI.split("\n")[0].split()[1].replace("InChI=", "")
+        data["geometry"] = np.array(geometry) * unit.angstrom
+        # Element symbols are converted to atomic numbers,
+        # as including an array of strings causes complications
+        # when writing the hdf5 file.
+        # data["elements"] = np.array(elements, dtype=str)
+        data["atomic_numbers"] = np.array(atomic_numbers)
+        data["charges"] = np.array(charges) * unit.elementary_charge
+
+        # remove the tag because it does not provide any useful information
+        properties.pop("tag")
+
+        # loop over the remaining properties and add them to the dict
+        for property, val in properties.items():
+            data[property] = val
+
+        for h in hvf_temp:
+            hvf.append(self._str_to_float(h))
+
+        data["harmonic_vibrational_frequencies"] = np.array(hvf) / unit.cm
+
+        # if output units were defined, perform the conversion
+        if self.convert_units:
+            for key in data.keys():
+                if key in self.unit_output_dict.keys():
+                    try:
+                        data[key] = data[key].to(self.unit_output_dict[key], "chem")
+                    except Exception:
+                        print(
+                            f"could not convert {key} with units {data[key].u} to {self.unit_output_dict[key]}"
+                        )
+
+        return data
+
+    def _process_downloaded(
+        self,
+        local_path_to_tar: str,
+        name: str,
+        unit_testing_max_records: Optional[int] = None,
+    ):
+        """
+        Processes a downloaded dataset: extracts relevant information and writes an hdf5 file.
+
+        Parameters
+        ----------
+        local_path_to_tar: str, required
+            Path to the tar.bz2 file.
+        name: str, required
+            Name of the tar.bz2 file.
+        unit_testing_max_records: int, optional, default=None
+            If set to an integer 'n', the routine will only process the first 'n' records; useful for unit tests.
+
+        Examples
+        --------
+        >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+        >>> qm9_data.process()
+
+        """
+        # untar the dataset
+        self._extract(
+            file_path=f"{local_path_to_tar}/{name}",
+            cache_directory=self.local_cache_dir,
+        )
+
+        # list the files in the directory to examine
+        files = list_files(directory=self.local_cache_dir, extension=".xyz")
+
+        # parse the information in each data file, saving to a list of dicts, data
+        self.data = []
+        for i, file in enumerate(tqdm(files, desc="processing", total=len(files))):
+            # stop early if we have reached unit_testing_max_records
+            if unit_testing_max_records is not None:
+                if i >= unit_testing_max_records:
+                    break
+
+            data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}")
+            self.data.append(data_temp)
+
+        mkdir(self.output_file_path)
+
+        full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}"
+
+        # generate the hdf5 file from the list of dicts
+        logger.debug("Writing HDF5 file.")
+        dict_to_hdf5(full_output_path, self.data, id_key="name")
+
+    def process(
+        self,
+        force_download: bool = False,
+        unit_testing_max_records: Optional[int] = None,
+    ) -> None:
+        """
+        Downloads the dataset, extracts relevant information, and writes an hdf5 file.
+
+        Parameters
+        ----------
+        force_download: bool, optional, default=False
+            If False, a raw data file already present in the local_cache_dir will be used.
+            If True, the data will be downloaded again, even if a local copy is present.
+        unit_testing_max_records: int, optional, default=None
+            If set to an integer 'n', the routine will only process the first 'n' records; useful for unit tests.
+
+        Examples
+        --------
+        >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+        >>> qm9_data.process()
+
+        """
+        url = self.dataset_download_url
+
+        # download the dataset
+        self.name = download_from_figshare(
+            url=url,
+            output_path=self.local_cache_dir,
+            force_download=force_download,
+        )
+        # process the rest of the dataset
+        if self.name is None:
+            raise Exception("Failed to retrieve name of file from figshare.")
+        self._process_downloaded(
+            self.local_cache_dir, self.name, unit_testing_max_records
+        )
+
+    def _generate_metadata(self):
+        with open(
+            f"{self.output_file_path}/{self.hdf5_file_name}.metadata", "w"
+        ) as f_md:
+            f_md.write("Dataset Description:\n")
+            f_md.write(self.dataset_description["description"])
+            f_md.write("\n\nPublication Citation:\n")
+            f_md.write(self.dataset_description["publication_citation"])
+            f_md.write("\n\nPublication DOI:\n")
+            f_md.write(self.dataset_description["publication_doi"])
+            f_md.write("\n\nSource dataset DOI:\n")
+            f_md.write(self.dataset_description["figshare_dataset_doi"])
+            f_md.write("\n\nSource dataset download URL:\n")
+            f_md.write(self.dataset_description["dataset_download_url"])
+
+            f_md.write("\n\nHDF5 dataset curated by modelforge:\n")
+            f_md.write(
+                "The top level of the HDF5 file contains entries for each record name.\n"
+            )
+            f_md.write(
+                "Each record contains the following data, where units, when appropriate, are stored as attributes.\n"
+            )
+            f_md.write("Unit naming conventions follow the openff-units package.\n\n")
+            f_md.write("property : type : units\n")
+            for key, val in self.data[0].items():
+                if isinstance(val, pint.Quantity):
+                    var_type = str(type(val.m).__name__)
+                    f_md.write(f"{key} : {var_type} : {val.u}\n")
+                else:
+                    var_type = str(type(val).__name__)
+                    f_md.write(f"{key} : {var_type} : N/A\n")
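For orientation, a minimal usage sketch of the curation workflow added above, mirroring the class docstring and the unit tests; the output paths here are illustrative and not part of the diff:

```python
from modelforge.curation.qm9_curation import QM9_curation

qm9_data = QM9_curation(
    hdf5_file_name="qm9_dataset.hdf5",
    output_file_path="./qm9_hdf5",      # illustrative path
    local_cache_dir="./qm9_datafiles",
    convert_units=True,  # angstrom/hartree -> nanometer/kJ/mol
)
# download, extract, parse, and write the hdf5 file;
# unit_testing_max_records limits processing to the first n records
qm9_data.process(force_download=False, unit_testing_max_records=10)
```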
diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py
new file mode 100644
index 00000000..31a2cefa
--- /dev/null
+++ b/modelforge/curation/utils.py
@@ -0,0 +1,184 @@
+import pint
+from openff.units import unit, Quantity
+import os
+from loguru import logger
+
+# define a new context for converting energy to energy/mol
+
+c = unit.Context("chem")
+c.add_transformation(
+    "[force] * [length]",
+    "[force] * [length]/[substance]",
+    lambda ureg, x: x * ureg.avogadro_constant,
+)
+c.add_transformation(
+    "[force] * [length]/[substance]",
+    "[force] * [length]",
+    lambda ureg, x: x / ureg.avogadro_constant,
+)
+unit.add_context(c)
+
+
+def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None:
+    """
+    Writes an hdf5 file from a list of dicts.
+
+    Units, when provided, are stored as attributes.
+
+    Parameters
+    ----------
+    file_name: str, required
+        Name and path of the hdf5 file to write.
+    data: list of dicts, required
+        List that contains dictionaries of properties for each molecule to write to file.
+    id_key: str, required
+        Name of the key in the dicts that uniquely identifies each record.
+
+    Examples
+    --------
+    >>> dict_to_hdf5(file_name='qm9.hdf5', data=data, id_key='name')
+    """
+
+    import h5py
+    from tqdm import tqdm
+    import numpy as np
+
+    assert file_name.endswith(".hdf5")
+
+    dt = h5py.special_dtype(vlen=str)
+
+    with h5py.File(file_name, "w") as f:
+        for datapoint in tqdm(data):
+            try:
+                record_name = datapoint[id_key]
+            except KeyError:
+                raise KeyError(f"id_key {id_key} not found in the data.")
+            group = f.create_group(record_name)
+            for key, val in datapoint.items():
+                if key != id_key:
+                    if isinstance(val, pint.Quantity):
+                        val_m = val.m
+                        val_u = str(val.u)
+                    else:
+                        val_m = val
+                        val_u = None
+                    if isinstance(val_m, str):
+                        group.create_dataset(name=key, data=val_m, dtype=dt)
+                    elif isinstance(val_m, (float, int)):
+                        group.create_dataset(name=key, data=val_m)
+                    elif isinstance(val_m, np.ndarray):
+                        group.create_dataset(name=key, data=val_m, shape=val_m.shape)
+                    if val_u is not None:
+                        group[key].attrs["units"] = val_u
+
+
+def mkdir(path: str) -> bool:
+    """Create the directory if it does not exist; returns True if it was created."""
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+            return True
+        except Exception:
+            print(f"Could not create directory {path}.")
+    else:
+        return False
+
+
+def download_from_figshare(url: str, output_path: str, force_download=False) -> str:
+    """
+    Downloads a dataset from figshare.
+
+    Parameters
+    ----------
+    url: str, required
+        Figshare url to the data downloader.
+    output_path: str, required
+        Location to download the file to.
+    force_download: bool, default=False
+        If False, a local copy in output_path will be used if it exists.
+        If True, the file will be downloaded even if it exists in output_path.
+
+    Returns
+    -------
+    str
+        Name of the file downloaded.
+
+    Examples
+    --------
+    >>> url = 'https://springernature.figshare.com/ndownloader/files/18112775'
+    >>> output_path = '/path/to/directory'
+    >>> downloaded_file_name = download_from_figshare(url, output_path)
+
+    """
+
+    import requests
+    from tqdm import tqdm
+
+    chunk_size = 512
+
+    # get the head of the request
+    head = requests.head(url)
+    # Because the url on figshare calls a downloader, instead of the direct file,
+    # we need to figure out where the original file lives to know how big it is.
+    # Here we parse the header info to get the file the downloader links to,
+    # and then get the head info from this link to fetch the length.
+    # This is not strictly necessary, but useful for the download status bar.
+    # We also fetch the name of the file from the header of the download link.
+    temp_url = head.headers["location"].split("?")[0]
+    name = head.headers["X-Filename"].split("/")[-1]
+
+    logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.")
+
+    if not os.path.isfile(f"{output_path}/{name}") or force_download:
+        length = int(requests.head(temp_url).headers["Content-Length"])
+
+        r = requests.get(url, stream=True)
+
+        mkdir(output_path)
+
+        with open(f"{output_path}/{name}", "wb") as fd:
+            for chunk in tqdm(
+                r.iter_content(chunk_size=chunk_size),
+                ascii=True,
+                desc="downloading",
+                total=(int(length / chunk_size) + 1),
+            ):
+                fd.write(chunk)
+    else:  # if the file exists and force_download is False, use the cached version
+        logger.debug(f"Datafile {name} already exists in {output_path}.")
+        logger.debug(
+            "Using already downloaded file; use force_download=True to re-download."
+        )
+
+    return name
+
+
+def list_files(directory: str, extension: str) -> list:
+    """
+    Returns a list of files in a directory with a given extension.
+
+    Parameters
+    ----------
+    directory: str, required
+        Directory of interest.
+    extension: str, required
+        Only consider files with this file extension.
+
+    Returns
+    -------
+    list
+        List of files in the given directory with the desired extension.
+
+    Examples
+    --------
+    List only the xyz files in a test_directory:
+    >>> files = list_files('test_directory', '.xyz')
+    """
+
+    logger.debug(f"Gathering {extension} files in {directory}.")
+
+    files = []
+    for file in os.listdir(directory):
+        if file.endswith(extension):
+            files.append(file)
+    files.sort()
+    return files
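The "chem" context registered at the top of utils.py is what allows per-particle energies (hartree) to convert to per-mole energies (kJ/mol), which are otherwise dimensionally incompatible. A minimal sketch of its effect:

```python
from openff.units import unit

import modelforge.curation.utils  # noqa: F401 -- importing registers the "chem" context

energy = 1.0 * unit.hartree
# without the context this raises a DimensionalityError, since
# kilojoule_per_mole carries an extra 1/[substance] dimension
energy_per_mole = energy.to(unit.kilojoule_per_mole, "chem")
print(energy_per_mole)  # ~2625.5 kilojoule / mole
```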
diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py
index 40af9452..208c6abb 100644
--- a/modelforge/dataset/dataset.py
+++ b/modelforge/dataset/dataset.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 from typing import Callable, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -126,6 +127,7 @@ def _from_hdf5(self) -> None:
 
         import h5py
         import tqdm
+        import shutil
 
         logger.debug("Reading in and processing hdf5 file ...")
         # initialize dict with empty lists
@@ -134,13 +136,17 @@ def _from_hdf5(self) -> None:
             data[value] = []
 
         logger.debug(f"Processing and extracting data from {self.raw_data_file}")
-        with gzip.open(self.raw_data_file, "rb") as gz_file, h5py.File(
-            gz_file, "r"
-        ) as hf:
-            logger.debug(f"n_entries: {len(hf.keys())}")
-            for mol in tqdm.tqdm(list(hf.keys())):
-                for value in self.properties_of_interest:
-                    data[value].append(hf[mol][value][()])
+        # this will create an unzipped file which we can then load in;
+        # this is substantially faster than passing gz_file directly to h5py.File()
+        with gzip.open(self.raw_data_file, "rb") as gz_file:
+            with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file:
+                shutil.copyfileobj(gz_file, out_file)
+        with h5py.File(self.raw_data_file.replace(".gz", ""), "r") as hf:
+            logger.debug(f"n_entries: {len(hf.keys())}")
+            for mol in tqdm.tqdm(list(hf.keys())):
+                for value in self.properties_of_interest:
+                    data[value].append(hf[mol][value][()])
+
         self.hdf5data = data
 
     def _from_file_cache(self) -> Dict[str, List]:
diff --git a/modelforge/dataset/qm9.py b/modelforge/dataset/qm9.py
index a935abd2..b6fa872b 100644
--- a/modelforge/dataset/qm9.py
+++ b/modelforge/dataset/qm9.py
@@ -67,7 +67,7 @@ def __init__(
             dataset_name = f"{dataset_name}_subset"
 
         super().__init__(
-            f"{local_cache_dir}/{dataset_name}_cache.hdf5",
+            f"{local_cache_dir}/{dataset_name}_cache.hdf5.gz",
            f"{local_cache_dir}/{dataset_name}_processed.npz",
         )
         self.dataset_name = dataset_name
diff --git a/modelforge/dataset/utils.py b/modelforge/dataset/utils.py
index c9ea2652..71b49158 100644
--- a/modelforge/dataset/utils.py
+++ b/modelforge/dataset/utils.py
@@ -139,7 +139,7 @@ def _download_from_gdrive(id: str, raw_dataset_file: str):
 
     Examples
     --------
-    >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5")
+    >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5.gz")
     """
     import gdown
diff --git a/modelforge/tests/data/dsgdb9nsd_000001.xyz b/modelforge/tests/data/dsgdb9nsd_000001.xyz
new file mode 100644
index 00000000..bd998ede
--- /dev/null
+++ b/modelforge/tests/data/dsgdb9nsd_000001.xyz
@@ -0,0 +1,10 @@
+5
+gdb 1 157.7118 157.70997 157.70699 0. 13.21 -0.3877 0.1171 0.5048 35.3641 0.044749 -40.47893 -40.476062 -40.475117 -40.498597 6.469
+C -0.0126981359 1.0858041578 0.0080009958 -0.535689
+H 0.002150416 -0.0060313176 0.0019761204 0.133921
+H 1.0117308433 1.4637511618 0.0002765748 0.133922
+H -0.540815069 1.4475266138 -0.8766437152 0.133923
+H -0.5238136345 1.4379326443 0.9063972942 0.133923
+1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
+C C
+InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4
diff --git a/modelforge/tests/data/first10.tar.bz2 b/modelforge/tests/data/first10.tar.bz2
new file mode 100644
index 00000000..8426d518
Binary files /dev/null and b/modelforge/tests/data/first10.tar.bz2 differ
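As a quick illustration of how the parser consumes the sample file above (a sketch; the relative path is illustrative and assumes a source checkout):

```python
from modelforge.curation.qm9_curation import QM9_curation

qm9_data = QM9_curation(hdf5_file_name="qm9_dataset.hdf5")
record = qm9_data._parse_xyzfile("modelforge/tests/data/dsgdb9nsd_000001.xyz")

print(record["name"])            # dsgdb9nsd_000001
print(record["atomic_numbers"])  # [6 1 1 1 1], i.e., methane
print(record["internal_energy_at_0K"])  # in kJ/mol, since convert_units defaults to True
```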
diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py
new file mode 100644
index 00000000..309aab39
--- /dev/null
+++ b/modelforge/tests/test_curation.py
@@ -0,0 +1,327 @@
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import h5py
+
+import pytest
+from openff.units import unit, Quantity
+import pint
+import importlib_resources as resources
+
+from modelforge.curation.utils import *
+
+from modelforge.curation.qm9_curation import *
+
+
+@pytest.fixture(scope="session")
+def prep_temp_dir(tmp_path_factory):
+    fn = tmp_path_factory.mktemp("hdf5_data")
+    return fn
+
+
+def test_dict_to_hdf5(prep_temp_dir):
+    # generate an hdf5 file from simple test data,
+    # then read it back in and check that we can reconstruct the same data
+    file_path = str(prep_temp_dir)
+    test_data = [
+        {
+            "name": "test1",
+            "energy": 123 * unit.hartree,
+            "geometry": np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) * unit.angstrom,
+        },
+        {
+            "name": "test2",
+            "energy": 456 * unit.hartree,
+            "geometry": np.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]]) * unit.angstrom,
+        },
+    ]
+    file_name_path = file_path + "/test.hdf5"
+    dict_to_hdf5(file_name=file_name_path, data=test_data, id_key="name")
+
+    # check that we wrote the file
+    assert os.path.isfile(file_name_path)
+
+    # read in the hdf5 file
+    records = []
+    with h5py.File(file_name_path, "r") as hf:
+        test_names = list(hf.keys())
+
+        # validate names
+        assert test_names == ["test1", "test2"]
+
+        for name in test_names:
+            temp_record = {}
+            temp_record["name"] = name
+            properties = list(hf[name].keys())
+
+            for property in properties:
+                # validate property names
+                assert property in ["energy", "geometry"]
+                if "units" in hf[name][property].attrs:
+                    u = hf[name][property].attrs["units"]
+                    temp_record[property] = hf[name][property][
+                        ()
+                    ] * unit.parse_expression(u)
+
+                else:
+                    temp_record[property] = hf[name][property][()]
+
+            records.append(temp_record)
+
+    # loop over the reconstructed list of dictionaries and compare contents
+    for i in range(len(records)):
+        for key in test_data[i].keys():
+            if isinstance(records[i][key], pint.Quantity):
+                record_m = records[i][key].m
+                test_data_m = test_data[i][key].m
+
+                if isinstance(record_m, np.ndarray):
+                    assert np.all(record_m == test_data_m)
+                else:
+                    assert record_m == test_data_m
+            else:
+                assert records[i][key] == test_data[i][key]
+
+
+def test_download_from_figshare(prep_temp_dir):
+    url = "https://figshare.com/ndownloader/files/22247589"
+    name = download_from_figshare(
+        url=url,
+        output_path=str(prep_temp_dir),
+        force_download=True,
+    )
+
+    file_name_path = str(prep_temp_dir) + f"/{name}"
+    assert os.path.isfile(file_name_path)
+
+
+def test_mkdir(prep_temp_dir):
+    # test the mkdir helper function that checks if a directory
+    # exists before creating it.
+    # the function returns a status: True if it had to create the directory,
+    # and False if it did not need to create the directory.
+    new_dir = str(prep_temp_dir) + "/new_subdir"
+    # first assert that the new directory does not exist
+    assert os.path.exists(new_dir) == False
+
+    # make the new directory and assert it now exists
+    status = mkdir(new_dir)
+    # if status == True, the directory was created because it did not exist
+    assert status == True
+    assert os.path.exists(new_dir) == True
+
+    # try to make the directory even though it exists;
+    # it should return False, indicating it already existed
+    # and did not need to be created
+    status = mkdir(new_dir)
+    assert os.path.exists(new_dir) == True
+    assert status == False
+
+
+def test_list_files(prep_temp_dir):
+    # test.hdf5 was generated in test_dict_to_hdf5
+    files = list_files(str(prep_temp_dir), ".hdf5")
+
+    # check that test.hdf5 is in the files
+    assert "test.hdf5" in files
+
+
+def test_qm9_curation_str_to_float(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    val = qm9_data._str_to_float("1*^6")
+    assert val == 1e6
+
+
+def test_qm9_curation_init_parameters(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+        convert_units=False,
+    )
+
+    assert qm9_data.hdf5_file_name == "qm9_dataset.hdf5"
+    assert qm9_data.output_file_path == str(prep_temp_dir)
+    assert qm9_data.local_cache_dir == str(prep_temp_dir)
+    assert qm9_data.convert_units == False
+
+
+def test_qm9_curation_parse_xyz(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    # check that we can parse the properties line correctly.
+    # This data is modified from dsgdb9nsd_000001.xyz, with floats truncated to one decimal place.
+    temp_line = "gdb 1 157.7 157.7 157.7 0. 13.2 -0.3 0.1 0.5 35.3 0.0 -40.4 -40.4 -40.4 -40.4 6.4"
+    temp_dict = qm9_data._parse_properties(temp_line)
+
+    assert len(temp_dict) == 17
+    assert temp_dict["tag"] == "gdb"
+    assert temp_dict["idx"] == "1"
+    assert temp_dict["rotational_constant_A"] == 157.7 * unit.gigahertz
+    assert temp_dict["rotational_constant_B"] == 157.7 * unit.gigahertz
+    assert temp_dict["rotational_constant_C"] == 157.7 * unit.gigahertz
+    assert temp_dict["dipole_moment"] == 0 * unit.debye
+    assert temp_dict["isotropic_polarizability"] == 13.2 * unit.angstrom**3
+    assert temp_dict["energy_of_homo"] == -0.3 * unit.hartree
+    assert temp_dict["energy_of_lumo"] == 0.1 * unit.hartree
+    assert temp_dict["lumo-homo_gap"] == 0.5 * unit.hartree
+    assert temp_dict["electronic_spatial_extent"] == 35.3 * unit.angstrom**2
+    assert temp_dict["zero_point_vibrational_energy"] == 0.0 * unit.hartree
+    assert temp_dict["internal_energy_at_0K"] == -40.4 * unit.hartree
+    assert temp_dict["internal_energy_at_298.15K"] == -40.4 * unit.hartree
+    assert temp_dict["enthalpy_at_298.15K"] == -40.4 * unit.hartree
+    assert temp_dict["free_energy_at_298.15K"] == -40.4 * unit.hartree
+    assert (
+        temp_dict["heat_capacity_at_298.15K"]
+        == 6.4 * unit.calorie_per_mole / unit.kelvin
+    )
+
+    # test parsing an entire file from our data directory, with unit conversions
+    fn = resources.files("modelforge").joinpath("tests", "data", "dsgdb9nsd_000001.xyz")
+    data_dict_temp = qm9_data._parse_xyzfile(str(fn))
+
+    # spot check values
+    assert np.all(
+        np.isclose(
+            data_dict_temp["geometry"],
+            np.array(
+                [
+                    [-1.26981359e-03, 1.08580416e-01, 8.00099580e-04],
+                    [2.15041600e-04, -6.03131760e-04, 1.97612040e-04],
+                    [1.01173084e-01, 1.46375116e-01, 2.76574800e-05],
+                    [-5.40815069e-02, 1.44752661e-01, -8.76643715e-02],
+                    [-5.23813634e-02, 1.43793264e-01, 9.06397294e-02],
+                ]
+            )
+            * unit.nanometer,
+        )
+    )
+
+    assert np.all(
+        data_dict_temp["charges"]
+        == np.array([-0.535689, 0.133921, 0.133922, 0.133923, 0.133923])
+        * unit.elementary_charge
+    )
+    assert data_dict_temp["isotropic_polarizability"] == 13.21 * unit.angstrom**3
+    assert (
+        data_dict_temp["energy_of_homo"]
+        == -1017.9062102263447 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["energy_of_lumo"] == 307.4460077830925 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["lumo-homo_gap"] == 1325.3522180094374 * unit.kilojoule_per_mole
+    )
+    assert data_dict_temp["electronic_spatial_extent"] == 35.3641 * unit.angstrom**2
+    assert (
+        data_dict_temp["zero_point_vibrational_energy"]
+        == 117.4884833670846 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["internal_energy_at_0K"]
+        == -106277.4161215308 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["internal_energy_at_298.15K"]
+        == -106269.88618856476 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["enthalpy_at_298.15K"]
+        == -106267.40509140545 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["free_energy_at_298.15K"]
+        == -106329.05182294044 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["heat_capacity_at_298.15K"]
+        == 0.027066296000000004 * unit.kilojoule_per_mole / unit.kelvin
+    )
+    assert np.all(data_dict_temp["atomic_numbers"] == np.array([6, 1, 1, 1, 1]))
+    assert data_dict_temp["smiles_gdb-17"] == "C"
+    assert data_dict_temp["smiles_b3lyp"] == "C"
+    assert data_dict_temp["inchi_Corina"] == "1S/CH4/h1H4"
+    assert data_dict_temp["inchi_B3LYP"] == "1S/CH4/h1H4"
+    assert data_dict_temp["rotational_constant_A"] == 157.7118 * unit.gigahertz
+    assert data_dict_temp["rotational_constant_B"] == 157.70997 * unit.gigahertz
+    assert data_dict_temp["rotational_constant_C"] == 157.70699 * unit.gigahertz
+    assert data_dict_temp["dipole_moment"] == 0.0 * unit.debye
+    assert np.all(
+        data_dict_temp["harmonic_vibrational_frequencies"]
+        == np.array(
+            [
+                1341.307,
+                1341.3284,
+                1341.365,
+                1562.6731,
+                1562.7453,
+                3038.3205,
+                3151.6034,
+                3151.6788,
+                3151.7078,
+            ]
+        )
+        / unit.centimeter
+    )
+
+
+def test_qm9_local_archive(prep_temp_dir):
+    # test file extraction, parsing, and generation of the hdf5 file from a local archive.
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_test10.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    local_data_path = resources.files("modelforge").joinpath("tests", "data")
+    # make sure the data archive exists
+    file_name_path = str(local_data_path) + "/first10.tar.bz2"
+    assert os.path.isfile(file_name_path)
+
+    # pass the local file to the _process_downloaded function
+    qm9_data._process_downloaded(str(local_data_path), "first10.tar.bz2")
+
+    assert len(qm9_data.data) == 10
+
+    names = {
+        "dsgdb9nsd_000001": -106277.4161215308,
+        "dsgdb9nsd_000002": -148408.69593977975,
+        "dsgdb9nsd_000003": -200600.51755556674,
+        "dsgdb9nsd_000004": -202973.24721725564,
+        "dsgdb9nsd_000005": -245252.87826713378,
+        "dsgdb9nsd_000006": -300576.6846578527,
+        "dsgdb9nsd_000007": -209420.75231941737,
+        "dsgdb9nsd_000008": -303715.5298633426,
+        "dsgdb9nsd_000009": -306158.32885940996,
+        "dsgdb9nsd_000010": -348451.454977435,
+    }
+    # output file
+    file_name_path = str(prep_temp_dir) + "/qm9_test10.hdf5"
+
+    assert os.path.isfile(file_name_path)
+
+    with h5py.File(file_name_path, "r") as hf:
+        for key in hf.keys():
+            # check record names
+            assert key in names
+            assert np.isclose(hf[key]["internal_energy_at_0K"][()], names[key])
+
+    qm9_data._process_downloaded(
+        str(local_data_path), "first10.tar.bz2", unit_testing_max_records=5
+    )
+
+    assert len(qm9_data.data) == 5
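For completeness, a sketch of how a consumer might read the curated file back, reattaching units from the stored attributes; this mirrors the round-trip logic in test_dict_to_hdf5, and the file name is illustrative:

```python
import h5py
from openff.units import unit

records = {}
with h5py.File("qm9_dataset.hdf5", "r") as hf:
    for name in hf.keys():
        record = {}
        for prop in hf[name].keys():
            value = hf[name][prop][()]
            # units, when present, are stored as a string attribute
            if "units" in hf[name][prop].attrs:
                value = value * unit.parse_expression(hf[name][prop].attrs["units"])
            record[prop] = value
        records[name] = record
```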