Skip to content

Commit

Permalink
Merge pull request #230 from chrisiacovella/refactor_downloader
Browse files (browse the repository at this point in the history)
Refactor downloading in curation
  • Loading branch information
chrisiacovella authored Aug 15, 2024
2 parents (4711e7e + 420479f), commit f2e71d6
Show file tree
Hide file tree
Showing 13 changed files with 69 additions and 377 deletions.
13 changes: 8 additions & 5 deletions modelforge/curation/ani1x_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,8 @@ def _init_dataset_parameters(self):
self.dataset_md5_checksum = data_inputs[self.version_select][
"dataset_md5_checksum"
]
self.dataset_length = data_inputs[self.version_select]["dataset_length"]
self.dataset_filename = data_inputs[self.version_select]["dataset_filename"]
logger.debug(
f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}"
)
Expand Down Expand Up @@ -407,26 +409,27 @@ def process(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_figshare
from modelforge.utils.remote import download_from_url

url = self.dataset_download_url

# download the dataset
self.name = download_from_figshare(
download_from_url(
url=url,
md5_checksum=self.dataset_md5_checksum,
output_path=self.local_cache_dir,
output_filename=self.dataset_filename,
length=self.dataset_length,
force_download=force_download,
)

self._clear_data()

# process the rest of the dataset
if self.name is None:
raise Exception("Failed to retrieve name of file from figshare.")

self._process_downloaded(
self.local_cache_dir,
self.name,
self.dataset_filename,
max_records=max_records,
max_conformers_per_record=max_conformers_per_record,
total_conformers=total_conformers,
Expand Down
16 changes: 9 additions & 7 deletions modelforge/curation/ani2x_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ def _init_dataset_parameters(self) -> None:
self.dataset_md5_checksum = data_inputs[self.version_select][
"dataset_md5_checksum"
]
self.dataset_filename = data_inputs[self.version_select]["dataset_filename"]
self.dataset_length = data_inputs[self.version_select]["dataset_length"]

logger.debug(
f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}"
)
Expand Down Expand Up @@ -291,21 +294,20 @@ def process(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_zenodo
from modelforge.utils.remote import download_from_url

url = self.dataset_download_url

# download the dataset
self.name = download_from_zenodo(
download_from_url(
url=url,
md5_checksum=self.dataset_md5_checksum,
output_path=self.local_cache_dir,
output_filename=self.dataset_filename,
length=self.dataset_length,
force_download=force_download,
)

if self.name is None:
raise Exception("Failed to retrieve name of file from Zenodo.")

# clear any data that might be present so we don't append to it
self._clear_data()

Expand All @@ -314,13 +316,13 @@ def process(

extract_tarred_file(
input_path_dir=self.local_cache_dir,
file_name=self.name,
file_name=self.dataset_filename,
output_path_dir=self.local_cache_dir,
mode="r:gz",
)

# the untarred file will be in a directory named 'final_h5' within the local_cache_dir.
hdf5_filename = f"{self.name.replace('.tar.gz', '')}.h5"
hdf5_filename = f"{self.dataset_filename.replace('.tar.gz', '')}.h5"

# process the rest of the dataset
self._process_downloaded(
Expand Down
15 changes: 8 additions & 7 deletions modelforge/curation/qm9_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def _init_dataset_parameters(self) -> None:
self.dataset_md5_checksum = data_inputs[self.version_select][
"dataset_md5_checksum"
]
self.dataset_filename = data_inputs[self.version_select]["dataset_filename"]
self.dataset_length = data_inputs[self.version_select]["dataset_length"]

logger.debug(
f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}"
)
Expand Down Expand Up @@ -640,32 +643,30 @@ def process(
"max_records and total_conformers cannot be set at the same time."
)

from modelforge.utils.remote import download_from_figshare
from modelforge.utils.remote import download_from_url

url = self.dataset_download_url

# download the dataset
self.name = download_from_figshare(
download_from_url(
url=url,
md5_checksum=self.dataset_md5_checksum,
output_path=self.local_cache_dir,
output_filename=self.dataset_filename,
length=self.dataset_length,
force_download=force_download,
)
# clear out the data array before we process
self._clear_data()

# process the rest of the dataset
if self.name is None:
raise Exception("Failed to retrieve name of file from figshare.")

# untar the dataset
from modelforge.utils.misc import extract_tarred_file

# extract the tar.bz2 file into the local_cache_dir
# creating a directory called qm9_xyz_files to hold the contents
extract_tarred_file(
input_path_dir=self.local_cache_dir,
file_name=self.name,
file_name=self.dataset_filename,
output_path_dir=f"{self.local_cache_dir}/qm9_xyz_files",
mode="r:bz2",
)
Expand Down
2 changes: 1 addition & 1 deletion modelforge/curation/scripts/curate_spice114.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def spice114_wrapper(
"""
from modelforge.curation.spice_1_curation import SPICE1Curation

spice_114 = SPICE114Curation(
spice_114 = SPICE1Curation(
hdf5_file_name=hdf5_file_name,
output_file_dir=output_file_dir,
local_cache_dir=local_cache_dir,
Expand Down
14 changes: 9 additions & 5 deletions modelforge/curation/spice_1_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,9 @@ def _init_dataset_parameters(self):
self.dataset_md5_checksum = data_inputs[self.version_select][
"dataset_md5_checksum"
]
self.dataset_filename = data_inputs[self.version_select]["dataset_filename"]
self.dataset_length = data_inputs[self.version_select]["dataset_length"]

logger.debug(
f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}"
)
Expand Down Expand Up @@ -369,15 +372,17 @@ def process(
raise Exception(
"max_records and total_conformers cannot be set at the same time."
)
from modelforge.utils.remote import download_from_zenodo
from modelforge.utils.remote import download_from_url

url = self.dataset_download_url

# download the dataset
self.name = download_from_zenodo(
download_from_url(
url=url,
md5_checksum=self.dataset_md5_checksum,
output_path=self.local_cache_dir,
output_filename=self.dataset_filename,
length=self.dataset_length,
force_download=force_download,
)

Expand All @@ -394,11 +399,10 @@ def process(
self.atomic_numbers_to_limit = None

# process the rest of the dataset
if self.name is None:
raise Exception("Failed to retrieve name of file from zenodo.")

self._process_downloaded(
self.local_cache_dir,
self.name,
self.dataset_filename,
max_records,
max_conformers_per_record,
total_conformers,
Expand Down
14 changes: 9 additions & 5 deletions modelforge/curation/spice_2_curation.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,9 @@ def _init_dataset_parameters(self):
self.dataset_md5_checksum = data_inputs[self.version_select][
"dataset_md5_checksum"
]
self.dataset_filename = data_inputs[self.version_select]["dataset_filename"]
self.dataset_length = data_inputs[self.version_select]["dataset_length"]

logger.debug(
f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}"
)
Expand Down Expand Up @@ -367,15 +370,17 @@ def process(
raise ValueError(
"max_records and total_conformers cannot be set at the same time."
)
from modelforge.utils.remote import download_from_zenodo
from modelforge.utils.remote import download_from_url

url = self.dataset_download_url

# download the dataset
self.name = download_from_zenodo(
download_from_url(
url=url,
md5_checksum=self.dataset_md5_checksum,
output_path=self.local_cache_dir,
output_filename=self.dataset_filename,
length=self.dataset_length,
force_download=force_download,
)

Expand All @@ -393,11 +398,10 @@ def process(
self.atomic_numbers_to_limit = None

# process the rest of the dataset
if self.name is None:
raise Exception("Failed to retrieve name of file from zenodo.")

self._process_downloaded(
self.local_cache_dir,
self.name,
self.dataset_filename,
max_records,
max_conformers_per_record,
total_conformers,
Expand Down
2 changes: 2 additions & 0 deletions modelforge/curation/yaml_files/ani1x_curation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ v_0:
version: 0
dataset_download_url: https://springernature.figshare.com/ndownloader/files/18112775
dataset_md5_checksum: 98090dd6679106da861f52bed825ffb7
dataset_length: 5590846027
dataset_filename: ani1xrelease.h5
2 changes: 2 additions & 0 deletions modelforge/curation/yaml_files/ani2x_curation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,6 @@ v_0:
version: 0
dataset_download_url: https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz
dataset_md5_checksum: cb1d9effb3d07fc1cc6ced7cd0b1e1f2
dataset_length: 3705675413
dataset_filename: ANI-2x-wB97X-631Gd.tar.gz

2 changes: 2 additions & 0 deletions modelforge/curation/yaml_files/qm9_curation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,5 @@ v_0:
version: 0
dataset_download_url: https://springernature.figshare.com/ndownloader/files/3195389
dataset_md5_checksum: ad1ebd51ee7f5b3a6e32e974e5d54012
dataset_length: 86144227
dataset_filename: dsgdb9nsd.xyz.tar.bz2
2 changes: 2 additions & 0 deletions modelforge/curation/yaml_files/spice1_curation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ v_0:
notes: SPICE 1.1.4 release
dataset_download_url: https://zenodo.org/records/8222043/files/SPICE-1.1.4.hdf5
dataset_md5_checksum: f27d4c81da0e37d6547276bf6b4ae6a1
dataset_length: 16058156944
dataset_filename: SPICE-1.1.4.hdf5
2 changes: 2 additions & 0 deletions modelforge/curation/yaml_files/spice2_curation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,3 +5,5 @@ v_0:
notes: SPICE 2.0.1 release
dataset_download_url: https://zenodo.org/records/10975225/files/SPICE-2.0.1.hdf5
dataset_md5_checksum: bfba2224b6540e1390a579569b475510
dataset_length: 37479271148
dataset_filename: SPICE-2.0.1.hdf5
Loading

0 comments on commit f2e71d6

Please sign in to comment.