diff --git a/devtools/conda-envs/test_env.yaml b/devtools/conda-envs/test_env.yaml
index 8527b1db..bef01088 100644
--- a/devtools/conda-envs/test_env.yaml
+++ b/devtools/conda-envs/test_env.yaml
@@ -19,6 +19,8 @@ dependencies:
   - lightning
   - tensorboard
   - torchvision
+  - openff-units
+  - pint
   - ase
 
 # Testing
diff --git a/modelforge/curation/__init__.py b/modelforge/curation/__init__.py
new file mode 100644
index 00000000..0f89000c
--- /dev/null
+++ b/modelforge/curation/__init__.py
@@ -0,0 +1 @@
+"""modelforge dataset curation."""
diff --git a/modelforge/curation/ani_curation.py b/modelforge/curation/ani_curation.py
new file mode 100644
index 00000000..e69de29b
diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py
new file mode 100644
index 00000000..2cea83d0
--- /dev/null
+++ b/modelforge/curation/qm9_curation.py
@@ -0,0 +1,414 @@
+import requests
+from loguru import logger
+import os
+from tqdm import tqdm
+
+from typing import Optional
+from openff.units import unit, Quantity
+import pint
+import qcelemental as qcel
+
+import tarfile
+from modelforge.curation.utils import *
+import numpy as np
+
+
+class QM9_curation:
+    """
+    Routines to fetch and process the QM9 dataset into a curated hdf5 file.
+
+    The QM9 dataset includes 133,885 organic molecules with up to nine heavy atoms (CONF).
+    All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.
+
+    Citation: Ramakrishnan, R., Dral, P., Rupp, M. et al.
+        Quantum chemistry structures and properties of 134 kilo molecules.
+        Sci Data 1, 140022 (2014).
+        https://doi.org/10.1038/sdata.2014.22
+
+    DOI for dataset: 10.6084/m9.figshare.c.978904.v5
+
+    Parameters
+    ----------
+    hdf5_file_name: str, required
+        Name of the hdf5 file that will be generated.
+    output_file_path: str, optional, default='./'
+        Path to write the output hdf5 file.
+    local_cache_dir: str, optional, default='./qm9_datafiles'
+        Location to save the downloaded dataset.
+    convert_units: bool, optional, default=True
+        Convert from the source units [angstrom, hartree]
+        to [nanometer, kJ/mol].
+
+    Examples
+    --------
+    >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+    >>> qm9_data.process()
+
+    """
+
+    def __init__(
+        self,
+        hdf5_file_name: str,
+        output_file_path: Optional[str] = "./",
+        local_cache_dir: Optional[str] = "./qm9_datafiles",
+        convert_units: Optional[bool] = True,
+    ):
+        self.local_cache_dir = local_cache_dir
+        self.output_file_path = output_file_path
+        self.hdf5_file_name = hdf5_file_name
+        self.convert_units = convert_units
+        self.dataset_download_url = (
+            "https://springernature.figshare.com/ndownloader/files/3195389"
+        )
+        # Below, we define key pieces of information related to the dataset in the form of a dict.
+        # `dataset_download_url` is the only variable used by the code to fetch the data.
+        # All other fields are metadata that will be used to generate a README to go along with
+        # the HDF5 dataset and to document the key info within the code.
+        self.dataset_description = {
+            "publication_doi": "10.1038/sdata.2014.22",
+            "figshare_dataset_doi": "10.6084/m9.figshare.c.978904.v5",
+            "figshare_dataset_url": "https://springernature.figshare.com/articles/dataset/Data_for_6095_constitutional_isomers_of_C7H10O2/1057646/2",
+            "dataset_download_url": "https://springernature.figshare.com/ndownloader/files/3195389",
+            "publication_citation": "Ramakrishnan, R., Dral, P., Rupp, M. et al. Quantum chemistry structures and properties of 134 kilo molecules. Sci Data 1, 140022 (2014).",
+            "dataset_citation": "Ramakrishnan, Raghunathan; Dral, Pavlo; Rupp, Matthias; Anatole von Lilienfeld, O. (2014). Quantum chemistry structures and properties of 134 kilo molecules. figshare. Collection. https://doi.org/10.6084/m9.figshare.c.978904.v5",
+            "description": "QM9 Dataset: Includes 133,885 organic molecules with up to nine heavy atoms (CONF). All properties were calculated at the B3LYP/6-31G(2df,p) level of quantum chemistry.",
+        }
+        # if convert_units is True, the following properties will be
+        # converted to the units defined here
+        self.unit_output_dict = {
+            "geometry": unit.nanometer,
+            "energy_of_homo": unit.kilojoule_per_mole,
+            "energy_of_lumo": unit.kilojoule_per_mole,
+            "lumo-homo_gap": unit.kilojoule_per_mole,
+            "zero_point_vibrational_energy": unit.kilojoule_per_mole,
+            "internal_energy_at_0K": unit.kilojoule_per_mole,
+            "internal_energy_at_298.15K": unit.kilojoule_per_mole,
+            "enthalpy_at_298.15K": unit.kilojoule_per_mole,
+            "free_energy_at_298.15K": unit.kilojoule_per_mole,
+            "heat_capacity_at_298.15K": unit.kilojoule_per_mole / unit.kelvin,
+        }
+
+    def _extract(self, file_path: str, cache_directory: str) -> None:
+        """
+        Extract the contents of a tar.bz2 file.
+
+        Parameters
+        ----------
+        file_path: str, required
+            Path of the tar.bz2 file to extract.
+        cache_directory: str, required
+            Location to save the contents of the tar.bz2 file.
+        """
+        logger.debug(f"Extracting tar {file_path}.")
+
+        with tarfile.open(f"{file_path}", "r:bz2") as tar:
+            tar.extractall(cache_directory)
+
+    def _str_to_float(self, x: str) -> float:
+        """
+        Converts a string to a float, changing Mathematica-style scientific notation to Python style.
+
+        For example, this will convert str(1*^-6) to float(1e-6).
+
+        Parameters
+        ----------
+        x : str, required
+            String to process.
+
+        Returns
+        -------
+        float
+            Float value of the string.
+        """
+        xf = float(x.replace("*^", "e"))
+        return xf
+
+    def _parse_properties(self, line: str) -> dict:
+        """
+        Parses the line in the xyz file that contains property information.
+
+        Parameters
+        ----------
+        line: str, required
+            String to parse that contains property information. The structure of this line
+            follows the description in the original manuscript (see tables 2 and 3).
+
+        Returns
+        -------
+        dict
+            Dictionary of properties, with units added when appropriate.
+        """
+
+        temp_prop = line.split()
+        # list of properties and their units in the order they appear in the file.
+
+        labels_and_units = [
+            ("tag", None),
+            ("idx", None),
+            ("rotational_constant_A", unit.gigahertz),
+            ("rotational_constant_B", unit.gigahertz),
+            ("rotational_constant_C", unit.gigahertz),
+            ("dipole_moment", unit.debye),
+            ("isotropic_polarizability", unit.angstrom**3),
+            ("energy_of_homo", unit.hartree),
+            ("energy_of_lumo", unit.hartree),
+            ("lumo-homo_gap", unit.hartree),
+            ("electronic_spatial_extent", unit.angstrom**2),
+            ("zero_point_vibrational_energy", unit.hartree),
+            ("internal_energy_at_0K", unit.hartree),
+            ("internal_energy_at_298.15K", unit.hartree),
+            ("enthalpy_at_298.15K", unit.hartree),
+            ("free_energy_at_298.15K", unit.hartree),
+            ("heat_capacity_at_298.15K", unit.calorie_per_mole / unit.kelvin),
+        ]
+
+        assert len(labels_and_units) == len(temp_prop)
+
+        data = {}
+        for prop, label_and_unit in zip(temp_prop, labels_and_units):
+            label, prop_unit = label_and_unit
+            if prop_unit is None:
+                data[label] = prop
+            else:
+                data[label] = self._str_to_float(prop) * prop_unit
+        return data
+
+    def _parse_xyzfile(self, file_name: str) -> dict:
+        """
+        Parses the file containing information for each molecule.
+
+        The structure of the file follows tables 2 and 3 of the original manuscript
+        (see the file format notes below).
+
+        Parameters
+        ----------
+        file_name: str, required
+            Name of the file to parse.
+
+        Returns
+        -------
+        dict
+            Dict of parsed properties.
+
+        File format info
+        ----------------
+
+        Line         Content
+        1            Number of atoms, n_a
+        2            Scalar properties (see below)
+        3,...,n_a+2  Element type, coords (x, y, z in Ang), Mulliken partial charges (in e)
+        n_a+3        Harmonic vibrational frequencies (3n_a-5 or 3n_a-6, in cm^-1)
+        n_a+4        SMILES strings from GDB-17 and from B3LYP relaxation
+        n_a+5        InChI strings for Corina and B3LYP geometries
+
+        Scalar properties:
+
+        #   Unit        Description
+        1   N/A         gdb9 string to facilitate extraction
+        2   N/A         Consecutive, 1-based integer identifier
+        3   GHz         Rotational constant A
+        4   GHz         Rotational constant B
+        5   GHz         Rotational constant C
+        6   D           Dipole moment
+        7   Ang^3       Isotropic polarizability
+        8   Ha          Energy of HOMO
+        9   Ha          Energy of LUMO
+        10  Ha          LUMO-HOMO gap
+        11  Ang^2       Electronic spatial extent
+        12  Ha          Zero point vibrational energy
+        13  Ha          Internal energy at 0K
+        14  Ha          Internal energy at 298.15K
+        15  Ha          Enthalpy at 298.15K
+        16  Ha          Free energy at 298.15K
+        17  cal/mol/K   Heat capacity at 298.15K
+
+        """
+
+        with open(file_name, "r") as file:
+            n_atoms = int(file.readline())
+            properties_temp = file.readline()
+            properties = self._parse_properties(properties_temp)
+            elements = []
+            atomic_numbers = []
+            geometry = []
+            charges = []
+            hvf = []
+            for i in range(n_atoms):
+                line = file.readline()
+                element, x, y, z, q = line.split()
+                elements.append(element)
+                atomic_numbers.append(qcel.periodictable.to_atomic_number(element))
+                temp = [
+                    self._str_to_float(x),
+                    self._str_to_float(y),
+                    self._str_to_float(z),
+                ]
+                geometry.append(temp)
+                charges.append(self._str_to_float(q))
+
+            hvf_temp = file.readline().split()
+
+            smiles = file.readline().split()
+            InChI = file.readline()
+
+        data = {}
+        data["name"] = file_name.split("/")[-1].split(".")[0]
+        data["smiles_gdb-17"] = smiles[0]
+        data["smiles_b3lyp"] = smiles[1]
+        data["inchi_Corina"] = InChI.split("\n")[0].split()[0].replace("InChI=", "")
+        data["inchi_B3LYP"] = InChI.split("\n")[0].split()[1].replace("InChI=", "")
+        data["geometry"] = np.array(geometry) * unit.angstrom
+        # Element symbols are converted to atomic numbers,
+        # as including an array of strings causes complications
+        # when writing the hdf5 file.
+        # data["elements"] = np.array(elements, dtype=str)
+        data["atomic_numbers"] = np.array(atomic_numbers)
+        data["charges"] = np.array(charges) * unit.elementary_charge
+
+        # remove the tag because it does not provide any useful information
+        properties.pop("tag")
+
+        # loop over the remaining properties and add them to the dict
+        for property, val in properties.items():
+            data[property] = val
+
+        for h in hvf_temp:
+            hvf.append(self._str_to_float(h))
+
+        data["harmonic_vibrational_frequencies"] = np.array(hvf) / unit.cm
+
+        # if output units were defined, perform the conversion
+        if self.convert_units:
+            for key in data.keys():
+                if key in self.unit_output_dict.keys():
+                    try:
+                        data[key] = data[key].to(self.unit_output_dict[key], "chem")
+                    except Exception:
+                        print(
+                            f"could not convert {key} with units {data[key].u} to {self.unit_output_dict[key]}"
+                        )
+
+        return data
+
+    def _process_downloaded(
+        self,
+        local_path_to_tar: str,
+        name: str,
+        unit_testing_max_records: Optional[int] = None,
+    ):
+        """
+        Processes a downloaded dataset: extracts relevant information and writes an hdf5 file.
+
+        Parameters
+        ----------
+        local_path_to_tar: str, required
+            Path to the tar.bz2 file.
+        name: str, required
+            Name of the tar.bz2 file.
+        unit_testing_max_records: int, optional, default=None
+            If set to an integer 'n', the routine will only process the first 'n' records; useful for unit tests.
+
+        Examples
+        --------
+        >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+        >>> qm9_data.process()
+
+        """
+        # untar the dataset
+        self._extract(
+            file_path=f"{local_path_to_tar}/{name}",
+            cache_directory=self.local_cache_dir,
+        )
+
+        # list the files in the directory to examine
+        files = list_files(directory=self.local_cache_dir, extension=".xyz")
+
+        # parse the information in each data file, saving to a list of dicts, data
+        self.data = []
+        for i, file in enumerate(tqdm(files, desc="processing", total=len(files))):
+            # stop early if we have reached unit_testing_max_records
+            if unit_testing_max_records is not None:
+                if i >= unit_testing_max_records:
+                    break
+
+            data_temp = self._parse_xyzfile(f"{self.local_cache_dir}/{file}")
+            self.data.append(data_temp)
+
+        mkdir(self.output_file_path)
+
+        full_output_path = f"{self.output_file_path}/{self.hdf5_file_name}"
+
+        # generate the hdf5 file from the list of dicts
+        logger.debug("Writing HDF5 file.")
+        dict_to_hdf5(full_output_path, self.data, id_key="name")
+
+    def process(
+        self,
+        force_download: bool = False,
+        unit_testing_max_records: Optional[int] = None,
+    ) -> None:
+        """
+        Downloads the dataset, extracts relevant information, and writes an hdf5 file.
+
+        Parameters
+        ----------
+        force_download: bool, optional, default=False
+            If False, a raw data file already present in the local_cache_dir will be used.
+            If True, the data will be downloaded again, even if a local copy is present.
+        unit_testing_max_records: int, optional, default=None
+            If set to an integer 'n', the routine will only process the first 'n' records; useful for unit tests.
+
+        Examples
+        --------
+        >>> qm9_data = QM9_curation(hdf5_file_name='qm9_dataset.hdf5', local_cache_dir='~/datasets/qm9_dataset')
+        >>> qm9_data.process()
+
+        """
+        url = self.dataset_download_url
+
+        # download the dataset
+        self.name = download_from_figshare(
+            url=url,
+            output_path=self.local_cache_dir,
+            force_download=force_download,
+        )
+        # process the rest of the dataset
+        if self.name is None:
+            raise Exception("Failed to retrieve name of file from figshare.")
+        self._process_downloaded(
+            self.local_cache_dir, self.name, unit_testing_max_records
+        )
+
+    def _generate_metadata(self):
+        with open(
+            f"{self.output_file_path}/{self.hdf5_file_name}.metadata", "w"
+        ) as f_md:
+            f_md.write("Dataset Description:\n")
+            f_md.write(self.dataset_description["description"])
+            f_md.write("\n\nPublication Citation:\n")
+            f_md.write(self.dataset_description["publication_citation"])
+            f_md.write("\n\nPublication DOI:\n")
+            f_md.write(self.dataset_description["publication_doi"])
+            f_md.write("\n\nSource dataset DOI:\n")
+            f_md.write(self.dataset_description["figshare_dataset_doi"])
+            f_md.write("\n\nSource dataset download URL:\n")
+            f_md.write(self.dataset_description["dataset_download_url"])
+
+            f_md.write("\n\nHDF5 dataset curated by modelforge:\n")
+            f_md.write(
+                "The top level of the HDF5 file contains entries for each record name.\n"
+            )
+            f_md.write(
+                "Each record contains the following data, where units, when appropriate, are stored as attributes.\n"
+            )
+            f_md.write("Unit naming conventions follow the openff-units package.\n\n")
+            f_md.write("property : type : units\n")
+            for key, val in self.data[0].items():
+                if isinstance(val, pint.Quantity):
+                    var_type = str(type(val.m).__name__)
+                    f_md.write(f"{key} : {var_type} : {val.u}\n")
+                else:
+                    var_type = str(type(val).__name__)
+                    f_md.write(f"{key} : {var_type} : N/A\n")
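For orientation, a minimal usage sketch of the curation workflow added above, mirroring the class docstring and the unit tests; the output paths here are illustrative and not part of the diff:

```python
from modelforge.curation.qm9_curation import QM9_curation

qm9_data = QM9_curation(
    hdf5_file_name="qm9_dataset.hdf5",
    output_file_path="./qm9_hdf5",      # illustrative path
    local_cache_dir="./qm9_datafiles",
    convert_units=True,  # angstrom/hartree -> nanometer/kJ/mol
)
# download, extract, parse, and write the hdf5 file;
# unit_testing_max_records limits processing to the first n records
qm9_data.process(force_download=False, unit_testing_max_records=10)
```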
diff --git a/modelforge/curation/utils.py b/modelforge/curation/utils.py
new file mode 100644
index 00000000..31a2cefa
--- /dev/null
+++ b/modelforge/curation/utils.py
@@ -0,0 +1,184 @@
+import pint
+from openff.units import unit, Quantity
+import os
+from loguru import logger
+
+# define a new context for converting energy to energy/mol
+
+c = unit.Context("chem")
+c.add_transformation(
+    "[force] * [length]",
+    "[force] * [length]/[substance]",
+    lambda ureg, x: x * ureg.avogadro_constant,
+)
+c.add_transformation(
+    "[force] * [length]/[substance]",
+    "[force] * [length]",
+    lambda ureg, x: x / ureg.avogadro_constant,
+)
+unit.add_context(c)
+
+
+def dict_to_hdf5(file_name: str, data: list, id_key: str) -> None:
+    """
+    Writes an hdf5 file from a list of dicts.
+
+    Units, when provided, are stored as attributes.
+
+    Parameters
+    ----------
+    file_name: str, required
+        Name and path of the hdf5 file to write.
+    data: list of dicts, required
+        List that contains dictionaries of properties for each molecule to write to file.
+    id_key: str, required
+        Name of the key in the dicts that uniquely identifies each record.
+
+    Examples
+    --------
+    >>> dict_to_hdf5(file_name='qm9.hdf5', data=data, id_key='name')
+    """
+
+    import h5py
+    from tqdm import tqdm
+    import numpy as np
+
+    assert file_name.endswith(".hdf5")
+
+    dt = h5py.special_dtype(vlen=str)
+
+    with h5py.File(file_name, "w") as f:
+        for datapoint in tqdm(data):
+            try:
+                record_name = datapoint[id_key]
+            except KeyError:
+                raise KeyError(f"id_key {id_key} not found in the data.")
+            group = f.create_group(record_name)
+            for key, val in datapoint.items():
+                if key != id_key:
+                    if isinstance(val, pint.Quantity):
+                        val_m = val.m
+                        val_u = str(val.u)
+                    else:
+                        val_m = val
+                        val_u = None
+                    if isinstance(val_m, str):
+                        group.create_dataset(name=key, data=val_m, dtype=dt)
+                    elif isinstance(val_m, (float, int)):
+                        group.create_dataset(name=key, data=val_m)
+                    elif isinstance(val_m, np.ndarray):
+                        group.create_dataset(name=key, data=val_m, shape=val_m.shape)
+                    if val_u is not None:
+                        group[key].attrs["units"] = val_u
+
+
+def mkdir(path: str) -> bool:
+    """Create the directory if it does not exist; returns True if it was created."""
+    if not os.path.exists(path):
+        try:
+            os.makedirs(path)
+            return True
+        except Exception:
+            print(f"Could not create directory {path}.")
+    else:
+        return False
+
+
+def download_from_figshare(url: str, output_path: str, force_download=False) -> str:
+    """
+    Downloads a dataset from figshare.
+
+    Parameters
+    ----------
+    url: str, required
+        Figshare url to the data downloader.
+    output_path: str, required
+        Location to download the file to.
+    force_download: bool, default=False
+        If False, a local copy in output_path will be used if it exists.
+        If True, the file will be downloaded even if it exists in output_path.
+
+    Returns
+    -------
+    str
+        Name of the file downloaded.
+
+    Examples
+    --------
+    >>> url = 'https://springernature.figshare.com/ndownloader/files/18112775'
+    >>> output_path = '/path/to/directory'
+    >>> downloaded_file_name = download_from_figshare(url, output_path)
+
+    """
+
+    import requests
+    from tqdm import tqdm
+
+    chunk_size = 512
+
+    # get the head of the request
+    head = requests.head(url)
+    # Because the url on figshare calls a downloader, instead of the direct file,
+    # we need to figure out where the original file lives to know how big it is.
+    # Here we parse the header info to get the file the downloader links to,
+    # and then get the head info from this link to fetch the length.
+    # This is not strictly necessary, but useful for the download status bar.
+    # We also fetch the name of the file from the header of the download link.
+    temp_url = head.headers["location"].split("?")[0]
+    name = head.headers["X-Filename"].split("/")[-1]
+
+    logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.")
+
+    if not os.path.isfile(f"{output_path}/{name}") or force_download:
+        length = int(requests.head(temp_url).headers["Content-Length"])
+
+        r = requests.get(url, stream=True)
+
+        mkdir(output_path)
+
+        with open(f"{output_path}/{name}", "wb") as fd:
+            for chunk in tqdm(
+                r.iter_content(chunk_size=chunk_size),
+                ascii=True,
+                desc="downloading",
+                total=(int(length / chunk_size) + 1),
+            ):
+                fd.write(chunk)
+    else:  # if the file exists and force_download is False, use the cached version
+        logger.debug(f"Datafile {name} already exists in {output_path}.")
+        logger.debug(
+            "Using already downloaded file; use force_download=True to re-download."
+        )
+
+    return name
+
+
+def list_files(directory: str, extension: str) -> list:
+    """
+    Returns a list of files in a directory with a given extension.
+
+    Parameters
+    ----------
+    directory: str, required
+        Directory of interest.
+    extension: str, required
+        Only consider files with this file extension.
+
+    Returns
+    -------
+    list
+        List of files in the given directory with the desired extension.
+
+    Examples
+    --------
+    List only the xyz files in a test_directory:
+    >>> files = list_files('test_directory', '.xyz')
+    """
+
+    logger.debug(f"Gathering {extension} files in {directory}.")
+
+    files = []
+    for file in os.listdir(directory):
+        if file.endswith(extension):
+            files.append(file)
+    files.sort()
+    return files
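The "chem" context registered at the top of utils.py is what allows per-particle energies (hartree) to convert to per-mole energies (kJ/mol), which are otherwise dimensionally incompatible. A minimal sketch of its effect:

```python
from openff.units import unit

import modelforge.curation.utils  # noqa: F401 -- importing registers the "chem" context

energy = 1.0 * unit.hartree
# without the context this raises a DimensionalityError, since
# kilojoule_per_mole carries an extra 1/[substance] dimension
energy_per_mole = energy.to(unit.kilojoule_per_mole, "chem")
print(energy_per_mole)  # ~2625.5 kilojoule / mole
```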
diff --git a/modelforge/dataset/dataset.py b/modelforge/dataset/dataset.py
index 40af9452..208c6abb 100644
--- a/modelforge/dataset/dataset.py
+++ b/modelforge/dataset/dataset.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 from typing import Callable, Dict, List, Optional, Tuple
 
 import numpy as np
@@ -126,6 +127,7 @@ def _from_hdf5(self) -> None:
 
         import h5py
         import tqdm
+        import shutil
 
         logger.debug("Reading in and processing hdf5 file ...")
         # initialize dict with empty lists
@@ -134,13 +136,17 @@ def _from_hdf5(self) -> None:
             data[value] = []
 
         logger.debug(f"Processing and extracting data from {self.raw_data_file}")
-        with gzip.open(self.raw_data_file, "rb") as gz_file, h5py.File(
-            gz_file, "r"
-        ) as hf:
-            logger.debug(f"n_entries: {len(hf.keys())}")
-            for mol in tqdm.tqdm(list(hf.keys())):
-                for value in self.properties_of_interest:
-                    data[value].append(hf[mol][value][()])
+        # this will create an unzipped file which we can then load in;
+        # this is substantially faster than passing gz_file directly to h5py.File()
+        with gzip.open(self.raw_data_file, "rb") as gz_file:
+            with open(self.raw_data_file.replace(".gz", ""), "wb") as out_file:
+                shutil.copyfileobj(gz_file, out_file)
+        with h5py.File(self.raw_data_file.replace(".gz", ""), "r") as hf:
+            logger.debug(f"n_entries: {len(hf.keys())}")
+            for mol in tqdm.tqdm(list(hf.keys())):
+                for value in self.properties_of_interest:
+                    data[value].append(hf[mol][value][()])
+
         self.hdf5data = data
 
     def _from_file_cache(self) -> Dict[str, List]:
diff --git a/modelforge/dataset/qm9.py b/modelforge/dataset/qm9.py
index a935abd2..b6fa872b 100644
--- a/modelforge/dataset/qm9.py
+++ b/modelforge/dataset/qm9.py
@@ -67,7 +67,7 @@ def __init__(
             dataset_name = f"{dataset_name}_subset"
 
         super().__init__(
-            f"{local_cache_dir}/{dataset_name}_cache.hdf5",
+            f"{local_cache_dir}/{dataset_name}_cache.hdf5.gz",
            f"{local_cache_dir}/{dataset_name}_processed.npz",
         )
         self.dataset_name = dataset_name
diff --git a/modelforge/dataset/utils.py b/modelforge/dataset/utils.py
index c9ea2652..71b49158 100644
--- a/modelforge/dataset/utils.py
+++ b/modelforge/dataset/utils.py
@@ -139,7 +139,7 @@ def _download_from_gdrive(id: str, raw_dataset_file: str):
 
     Examples
     --------
-    >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5")
+    >>> _download_from_gdrive("1v2gV3sG9JhMZ5QZn3gFB9j5ZIs0Xjxz8", "data_file.hdf5.gz")
     """
     import gdown
diff --git a/modelforge/tests/data/dsgdb9nsd_000001.xyz b/modelforge/tests/data/dsgdb9nsd_000001.xyz
new file mode 100644
index 00000000..bd998ede
--- /dev/null
+++ b/modelforge/tests/data/dsgdb9nsd_000001.xyz
@@ -0,0 +1,10 @@
+5
+gdb 1 157.7118 157.70997 157.70699 0. 13.21 -0.3877 0.1171 0.5048 35.3641 0.044749 -40.47893 -40.476062 -40.475117 -40.498597 6.469
+C -0.0126981359 1.0858041578 0.0080009958 -0.535689
+H 0.002150416 -0.0060313176 0.0019761204 0.133921
+H 1.0117308433 1.4637511618 0.0002765748 0.133922
+H -0.540815069 1.4475266138 -0.8766437152 0.133923
+H -0.5238136345 1.4379326443 0.9063972942 0.133923
+1341.307 1341.3284 1341.365 1562.6731 1562.7453 3038.3205 3151.6034 3151.6788 3151.7078
+C C
+InChI=1S/CH4/h1H4 InChI=1S/CH4/h1H4
diff --git a/modelforge/tests/data/first10.tar.bz2 b/modelforge/tests/data/first10.tar.bz2
new file mode 100644
index 00000000..8426d518
Binary files /dev/null and b/modelforge/tests/data/first10.tar.bz2 differ
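As a quick illustration of how the parser consumes the sample file above (a sketch; the relative path is illustrative and assumes a source checkout):

```python
from modelforge.curation.qm9_curation import QM9_curation

qm9_data = QM9_curation(hdf5_file_name="qm9_dataset.hdf5")
record = qm9_data._parse_xyzfile("modelforge/tests/data/dsgdb9nsd_000001.xyz")

print(record["name"])            # dsgdb9nsd_000001
print(record["atomic_numbers"])  # [6 1 1 1 1], i.e., methane
print(record["internal_energy_at_0K"])  # in kJ/mol, since convert_units defaults to True
```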
diff --git a/modelforge/tests/test_curation.py b/modelforge/tests/test_curation.py
new file mode 100644
index 00000000..309aab39
--- /dev/null
+++ b/modelforge/tests/test_curation.py
@@ -0,0 +1,327 @@
+import os
+import sys
+from pathlib import Path
+
+import numpy as np
+import h5py
+
+import pytest
+from openff.units import unit, Quantity
+import pint
+import importlib_resources as resources
+
+from modelforge.curation.utils import *
+
+from modelforge.curation.qm9_curation import *
+
+
+@pytest.fixture(scope="session")
+def prep_temp_dir(tmp_path_factory):
+    fn = tmp_path_factory.mktemp("hdf5_data")
+    return fn
+
+
+def test_dict_to_hdf5(prep_temp_dir):
+    # generate an hdf5 file from simple test data,
+    # then read it back in and check that we can reconstruct the same data
+    file_path = str(prep_temp_dir)
+    test_data = [
+        {
+            "name": "test1",
+            "energy": 123 * unit.hartree,
+            "geometry": np.array([[1.0, 1.0, 1.0], [1.0, 1.0, 1.0]]) * unit.angstrom,
+        },
+        {
+            "name": "test2",
+            "energy": 456 * unit.hartree,
+            "geometry": np.array([[1.0, 1.0, 1.0], [2.0, 2.0, 2.0]]) * unit.angstrom,
+        },
+    ]
+    file_name_path = file_path + "/test.hdf5"
+    dict_to_hdf5(file_name=file_name_path, data=test_data, id_key="name")
+
+    # check that we wrote the file
+    assert os.path.isfile(file_name_path)
+
+    # read in the hdf5 file
+    records = []
+    with h5py.File(file_name_path, "r") as hf:
+        test_names = list(hf.keys())
+
+        # validate names
+        assert test_names == ["test1", "test2"]
+
+        for name in test_names:
+            temp_record = {}
+            temp_record["name"] = name
+            properties = list(hf[name].keys())
+
+            for property in properties:
+                # validate property names
+                assert property in ["energy", "geometry"]
+                if "units" in hf[name][property].attrs:
+                    u = hf[name][property].attrs["units"]
+                    temp_record[property] = hf[name][property][
+                        ()
+                    ] * unit.parse_expression(u)
+
+                else:
+                    temp_record[property] = hf[name][property][()]
+
+            records.append(temp_record)
+
+    # loop over the reconstructed list of dictionaries and compare contents
+    for i in range(len(records)):
+        for key in test_data[i].keys():
+            if isinstance(records[i][key], pint.Quantity):
+                record_m = records[i][key].m
+                test_data_m = test_data[i][key].m
+
+                if isinstance(record_m, np.ndarray):
+                    assert np.all(record_m == test_data_m)
+                else:
+                    assert record_m == test_data_m
+            else:
+                assert records[i][key] == test_data[i][key]
+
+
+def test_download_from_figshare(prep_temp_dir):
+    url = "https://figshare.com/ndownloader/files/22247589"
+    name = download_from_figshare(
+        url=url,
+        output_path=str(prep_temp_dir),
+        force_download=True,
+    )
+
+    file_name_path = str(prep_temp_dir) + f"/{name}"
+    assert os.path.isfile(file_name_path)
+
+
+def test_mkdir(prep_temp_dir):
+    # test the mkdir helper function that checks if a directory
+    # exists before creating it.
+    # the function returns a status: True if it had to create the directory,
+    # and False if it did not need to create the directory.
+    new_dir = str(prep_temp_dir) + "/new_subdir"
+    # first assert that the new directory does not exist
+    assert os.path.exists(new_dir) == False
+
+    # make the new directory and assert it now exists
+    status = mkdir(new_dir)
+    # if status == True, the directory was created because it did not exist
+    assert status == True
+    assert os.path.exists(new_dir) == True
+
+    # try to make the directory even though it exists;
+    # it should return False, indicating it already existed
+    # and did not need to be created
+    status = mkdir(new_dir)
+    assert os.path.exists(new_dir) == True
+    assert status == False
+
+
+def test_list_files(prep_temp_dir):
+    # test.hdf5 was generated in test_dict_to_hdf5
+    files = list_files(str(prep_temp_dir), ".hdf5")
+
+    # check that test.hdf5 is in the files
+    assert "test.hdf5" in files
+
+
+def test_qm9_curation_str_to_float(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    val = qm9_data._str_to_float("1*^6")
+    assert val == 1e6
+
+
+def test_qm9_curation_init_parameters(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+        convert_units=False,
+    )
+
+    assert qm9_data.hdf5_file_name == "qm9_dataset.hdf5"
+    assert qm9_data.output_file_path == str(prep_temp_dir)
+    assert qm9_data.local_cache_dir == str(prep_temp_dir)
+    assert qm9_data.convert_units == False
+
+
+def test_qm9_curation_parse_xyz(prep_temp_dir):
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_dataset.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    # check that we can parse the properties line correctly.
+    # This data is modified from dsgdb9nsd_000001.xyz, with floats truncated to one decimal place.
+    temp_line = "gdb 1 157.7 157.7 157.7 0. 13.2 -0.3 0.1 0.5 35.3 0.0 -40.4 -40.4 -40.4 -40.4 6.4"
+    temp_dict = qm9_data._parse_properties(temp_line)
+
+    assert len(temp_dict) == 17
+    assert temp_dict["tag"] == "gdb"
+    assert temp_dict["idx"] == "1"
+    assert temp_dict["rotational_constant_A"] == 157.7 * unit.gigahertz
+    assert temp_dict["rotational_constant_B"] == 157.7 * unit.gigahertz
+    assert temp_dict["rotational_constant_C"] == 157.7 * unit.gigahertz
+    assert temp_dict["dipole_moment"] == 0 * unit.debye
+    assert temp_dict["isotropic_polarizability"] == 13.2 * unit.angstrom**3
+    assert temp_dict["energy_of_homo"] == -0.3 * unit.hartree
+    assert temp_dict["energy_of_lumo"] == 0.1 * unit.hartree
+    assert temp_dict["lumo-homo_gap"] == 0.5 * unit.hartree
+    assert temp_dict["electronic_spatial_extent"] == 35.3 * unit.angstrom**2
+    assert temp_dict["zero_point_vibrational_energy"] == 0.0 * unit.hartree
+    assert temp_dict["internal_energy_at_0K"] == -40.4 * unit.hartree
+    assert temp_dict["internal_energy_at_298.15K"] == -40.4 * unit.hartree
+    assert temp_dict["enthalpy_at_298.15K"] == -40.4 * unit.hartree
+    assert temp_dict["free_energy_at_298.15K"] == -40.4 * unit.hartree
+    assert (
+        temp_dict["heat_capacity_at_298.15K"]
+        == 6.4 * unit.calorie_per_mole / unit.kelvin
+    )
+
+    # test parsing an entire file from our data directory, with unit conversions
+    fn = resources.files("modelforge").joinpath("tests", "data", "dsgdb9nsd_000001.xyz")
+    data_dict_temp = qm9_data._parse_xyzfile(str(fn))
+
+    # spot check values
+    assert np.all(
+        np.isclose(
+            data_dict_temp["geometry"],
+            np.array(
+                [
+                    [-1.26981359e-03, 1.08580416e-01, 8.00099580e-04],
+                    [2.15041600e-04, -6.03131760e-04, 1.97612040e-04],
+                    [1.01173084e-01, 1.46375116e-01, 2.76574800e-05],
+                    [-5.40815069e-02, 1.44752661e-01, -8.76643715e-02],
+                    [-5.23813634e-02, 1.43793264e-01, 9.06397294e-02],
+                ]
+            )
+            * unit.nanometer,
+        )
+    )
+
+    assert np.all(
+        data_dict_temp["charges"]
+        == np.array([-0.535689, 0.133921, 0.133922, 0.133923, 0.133923])
+        * unit.elementary_charge
+    )
+    assert data_dict_temp["isotropic_polarizability"] == 13.21 * unit.angstrom**3
+    assert (
+        data_dict_temp["energy_of_homo"]
+        == -1017.9062102263447 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["energy_of_lumo"] == 307.4460077830925 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["lumo-homo_gap"] == 1325.3522180094374 * unit.kilojoule_per_mole
+    )
+    assert data_dict_temp["electronic_spatial_extent"] == 35.3641 * unit.angstrom**2
+    assert (
+        data_dict_temp["zero_point_vibrational_energy"]
+        == 117.4884833670846 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["internal_energy_at_0K"]
+        == -106277.4161215308 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["internal_energy_at_298.15K"]
+        == -106269.88618856476 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["enthalpy_at_298.15K"]
+        == -106267.40509140545 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["free_energy_at_298.15K"]
+        == -106329.05182294044 * unit.kilojoule_per_mole
+    )
+    assert (
+        data_dict_temp["heat_capacity_at_298.15K"]
+        == 0.027066296000000004 * unit.kilojoule_per_mole / unit.kelvin
+    )
+    assert np.all(data_dict_temp["atomic_numbers"] == np.array([6, 1, 1, 1, 1]))
+    assert data_dict_temp["smiles_gdb-17"] == "C"
+    assert data_dict_temp["smiles_b3lyp"] == "C"
+    assert data_dict_temp["inchi_Corina"] == "1S/CH4/h1H4"
+    assert data_dict_temp["inchi_B3LYP"] == "1S/CH4/h1H4"
+    assert data_dict_temp["rotational_constant_A"] == 157.7118 * unit.gigahertz
+    assert data_dict_temp["rotational_constant_B"] == 157.70997 * unit.gigahertz
+    assert data_dict_temp["rotational_constant_C"] == 157.70699 * unit.gigahertz
+    assert data_dict_temp["dipole_moment"] == 0.0 * unit.debye
+    assert np.all(
+        data_dict_temp["harmonic_vibrational_frequencies"]
+        == np.array(
+            [
+                1341.307,
+                1341.3284,
+                1341.365,
+                1562.6731,
+                1562.7453,
+                3038.3205,
+                3151.6034,
+                3151.6788,
+                3151.7078,
+            ]
+        )
+        / unit.centimeter
+    )
+
+
+def test_qm9_local_archive(prep_temp_dir):
+    # test file extraction, parsing, and generation of the hdf5 file from a local archive.
+    qm9_data = QM9_curation(
+        hdf5_file_name="qm9_test10.hdf5",
+        output_file_path=str(prep_temp_dir),
+        local_cache_dir=str(prep_temp_dir),
+    )
+
+    local_data_path = resources.files("modelforge").joinpath("tests", "data")
+    # make sure the data archive exists
+    file_name_path = str(local_data_path) + "/first10.tar.bz2"
+    assert os.path.isfile(file_name_path)
+
+    # pass the local file to the _process_downloaded function
+    qm9_data._process_downloaded(str(local_data_path), "first10.tar.bz2")
+
+    assert len(qm9_data.data) == 10
+
+    names = {
+        "dsgdb9nsd_000001": -106277.4161215308,
+        "dsgdb9nsd_000002": -148408.69593977975,
+        "dsgdb9nsd_000003": -200600.51755556674,
+        "dsgdb9nsd_000004": -202973.24721725564,
+        "dsgdb9nsd_000005": -245252.87826713378,
+        "dsgdb9nsd_000006": -300576.6846578527,
+        "dsgdb9nsd_000007": -209420.75231941737,
+        "dsgdb9nsd_000008": -303715.5298633426,
+        "dsgdb9nsd_000009": -306158.32885940996,
+        "dsgdb9nsd_000010": -348451.454977435,
+    }
+    # output file
+    file_name_path = str(prep_temp_dir) + "/qm9_test10.hdf5"
+
+    assert os.path.isfile(file_name_path)
+
+    with h5py.File(file_name_path, "r") as hf:
+        for key in hf.keys():
+            # check record names
+            assert key in names
+            assert np.isclose(hf[key]["internal_energy_at_0K"][()], names[key])
+
+    qm9_data._process_downloaded(
+        str(local_data_path), "first10.tar.bz2", unit_testing_max_records=5
+    )
+
+    assert len(qm9_data.data) == 5
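For completeness, a sketch of how a consumer might read the curated file back, reattaching units from the stored attributes; this mirrors the round-trip logic in test_dict_to_hdf5, and the file name is illustrative:

```python
import h5py
from openff.units import unit

records = {}
with h5py.File("qm9_dataset.hdf5", "r") as hf:
    for name in hf.keys():
        record = {}
        for prop in hf[name].keys():
            value = hf[name][prop][()]
            # units, when present, are stored as a string attribute
            if "units" in hf[name][prop].attrs:
                value = value * unit.parse_expression(hf[name][prop].attrs["units"])
            record[prop] = value
        records[name] = record
```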