diff --git a/modelforge/curation/ani1x_curation.py b/modelforge/curation/ani1x_curation.py index edab841a..382825ec 100644 --- a/modelforge/curation/ani1x_curation.py +++ b/modelforge/curation/ani1x_curation.py @@ -83,6 +83,8 @@ def _init_dataset_parameters(self): self.dataset_md5_checksum = data_inputs[self.version_select][ "dataset_md5_checksum" ] + self.dataset_length = data_inputs[self.version_select]["dataset_length"] + self.dataset_filename = data_inputs[self.version_select]["dataset_filename"] logger.debug( f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}" ) @@ -407,26 +409,27 @@ def process( "max_records and total_conformers cannot be set at the same time." ) - from modelforge.utils.remote import download_from_figshare + from modelforge.utils.remote import download_from_url url = self.dataset_download_url # download the dataset - self.name = download_from_figshare( + download_from_url( url=url, md5_checksum=self.dataset_md5_checksum, output_path=self.local_cache_dir, + output_filename=self.dataset_filename, + length=self.dataset_length, force_download=force_download, ) self._clear_data() # process the rest of the dataset - if self.name is None: - raise Exception("Failed to retrieve name of file from figshare.") + self._process_downloaded( self.local_cache_dir, - self.name, + self.dataset_filename, max_records=max_records, max_conformers_per_record=max_conformers_per_record, total_conformers=total_conformers, diff --git a/modelforge/curation/ani2x_curation.py b/modelforge/curation/ani2x_curation.py index ccddf0ea..09ecf9e9 100644 --- a/modelforge/curation/ani2x_curation.py +++ b/modelforge/curation/ani2x_curation.py @@ -69,6 +69,9 @@ def _init_dataset_parameters(self) -> None: self.dataset_md5_checksum = data_inputs[self.version_select][ "dataset_md5_checksum" ] + self.dataset_filename = data_inputs[self.version_select]["dataset_filename"] + self.dataset_length = data_inputs[self.version_select]["dataset_length"] + logger.debug( f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}" ) @@ -291,21 +294,20 @@ def process( "max_records and total_conformers cannot be set at the same time." ) - from modelforge.utils.remote import download_from_zenodo + from modelforge.utils.remote import download_from_url url = self.dataset_download_url # download the dataset - self.name = download_from_zenodo( + download_from_url( url=url, md5_checksum=self.dataset_md5_checksum, output_path=self.local_cache_dir, + output_filename=self.dataset_filename, + length=self.dataset_length, force_download=force_download, ) - if self.name is None: - raise Exception("Failed to retrieve name of file from Zenodo.") - # clear any data that might be present so we don't append to it self._clear_data() @@ -314,13 +316,13 @@ def process( extract_tarred_file( input_path_dir=self.local_cache_dir, - file_name=self.name, + file_name=self.dataset_filename, output_path_dir=self.local_cache_dir, mode="r:gz", ) # the untarred file will be in a directory named 'final_h5' within the local_cache_dir, - hdf5_filename = f"{self.name.replace('.tar.gz', '')}.h5" + hdf5_filename = f"{self.dataset_filename.replace('.tar.gz', '')}.h5" # process the rest of the dataset self._process_downloaded( diff --git a/modelforge/curation/qm9_curation.py b/modelforge/curation/qm9_curation.py index 847eaa01..67adde87 100644 --- a/modelforge/curation/qm9_curation.py +++ b/modelforge/curation/qm9_curation.py @@ -68,6 +68,9 @@ def _init_dataset_parameters(self) -> None: self.dataset_md5_checksum = data_inputs[self.version_select][ "dataset_md5_checksum" ] + self.dataset_filename = data_inputs[self.version_select]["dataset_filename"] + self.dataset_length = data_inputs[self.version_select]["dataset_length"] + logger.debug( f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}" ) @@ -640,24 +643,22 @@ def process( "max_records and total_conformers cannot be set at the same time." ) - from modelforge.utils.remote import download_from_figshare + from modelforge.utils.remote import download_from_url url = self.dataset_download_url # download the dataset - self.name = download_from_figshare( + download_from_url( url=url, md5_checksum=self.dataset_md5_checksum, output_path=self.local_cache_dir, + output_filename=self.dataset_filename, + length=self.dataset_length, force_download=force_download, ) # clear out the data array before we process self._clear_data() - # process the rest of the dataset - if self.name is None: - raise Exception("Failed to retrieve name of file from figshare.") - # untar the dataset from modelforge.utils.misc import extract_tarred_file @@ -665,7 +666,7 @@ def process( # creating a directory called qm9_xyz_files to hold the contents extract_tarred_file( input_path_dir=self.local_cache_dir, - file_name=self.name, + file_name=self.dataset_filename, output_path_dir=f"{self.local_cache_dir}/qm9_xyz_files", mode="r:bz2", ) diff --git a/modelforge/curation/scripts/curate_spice114.py b/modelforge/curation/scripts/curate_spice114.py index aa68d7d2..df2c6830 100644 --- a/modelforge/curation/scripts/curate_spice114.py +++ b/modelforge/curation/scripts/curate_spice114.py @@ -68,7 +68,7 @@ def spice114_wrapper( """ from modelforge.curation.spice_1_curation import SPICE1Curation - spice_114 = SPICE114Curation( + spice_114 = SPICE1Curation( hdf5_file_name=hdf5_file_name, output_file_dir=output_file_dir, local_cache_dir=local_cache_dir, diff --git a/modelforge/curation/spice_1_curation.py b/modelforge/curation/spice_1_curation.py index 62ada82e..5d243d2c 100644 --- a/modelforge/curation/spice_1_curation.py +++ b/modelforge/curation/spice_1_curation.py @@ -68,6 +68,9 @@ def _init_dataset_parameters(self): self.dataset_md5_checksum = data_inputs[self.version_select][ "dataset_md5_checksum" ] + self.dataset_filename = data_inputs[self.version_select]["dataset_filename"] + self.dataset_length = data_inputs[self.version_select]["dataset_length"] + logger.debug( f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}" ) @@ -369,15 +372,17 @@ def process( raise Exception( "max_records and total_conformers cannot be set at the same time." ) - from modelforge.utils.remote import download_from_zenodo + from modelforge.utils.remote import download_from_url url = self.dataset_download_url # download the dataset - self.name = download_from_zenodo( + download_from_url( url=url, md5_checksum=self.dataset_md5_checksum, output_path=self.local_cache_dir, + output_filename=self.dataset_filename, + length=self.dataset_length, force_download=force_download, ) @@ -394,11 +399,10 @@ def process( self.atomic_numbers_to_limit = None # process the rest of the dataset - if self.name is None: - raise Exception("Failed to retrieve name of file from zenodo.") + self._process_downloaded( self.local_cache_dir, - self.name, + self.dataset_filename, max_records, max_conformers_per_record, total_conformers, diff --git a/modelforge/curation/spice_2_curation.py b/modelforge/curation/spice_2_curation.py index 452b34b1..2db87fd9 100644 --- a/modelforge/curation/spice_2_curation.py +++ b/modelforge/curation/spice_2_curation.py @@ -65,6 +65,9 @@ def _init_dataset_parameters(self): self.dataset_md5_checksum = data_inputs[self.version_select][ "dataset_md5_checksum" ] + self.dataset_filename = data_inputs[self.version_select]["dataset_filename"] + self.dataset_length = data_inputs[self.version_select]["dataset_length"] + logger.debug( f"Dataset: {self.version_select} version: {data_inputs[self.version_select]['version']}" ) @@ -367,15 +370,17 @@ def process( raise ValueError( "max_records and total_conformers cannot be set at the same time." ) - from modelforge.utils.remote import download_from_zenodo + from modelforge.utils.remote import download_from_url url = self.dataset_download_url # download the dataset - self.name = download_from_zenodo( + download_from_url( url=url, md5_checksum=self.dataset_md5_checksum, output_path=self.local_cache_dir, + output_filename=self.dataset_filename, + length=self.dataset_length, force_download=force_download, ) @@ -393,11 +398,10 @@ def process( self.atomic_numbers_to_limit = None # process the rest of the dataset - if self.name is None: - raise Exception("Failed to retrieve name of file from zenodo.") + self._process_downloaded( self.local_cache_dir, - self.name, + self.dataset_filename, max_records, max_conformers_per_record, total_conformers, diff --git a/modelforge/curation/yaml_files/ani1x_curation.yaml b/modelforge/curation/yaml_files/ani1x_curation.yaml index 5c933eba..8ca1c3cc 100644 --- a/modelforge/curation/yaml_files/ani1x_curation.yaml +++ b/modelforge/curation/yaml_files/ani1x_curation.yaml @@ -4,3 +4,5 @@ v_0: version: 0 dataset_download_url: https://springernature.figshare.com/ndownloader/files/18112775 dataset_md5_checksum: 98090dd6679106da861f52bed825ffb7 + dataset_length: 5590846027 + dataset_filename: ani1xrelease.h5 diff --git a/modelforge/curation/yaml_files/ani2x_curation.yaml b/modelforge/curation/yaml_files/ani2x_curation.yaml index eb24880c..8a494f73 100644 --- a/modelforge/curation/yaml_files/ani2x_curation.yaml +++ b/modelforge/curation/yaml_files/ani2x_curation.yaml @@ -4,4 +4,6 @@ v_0: version: 0 dataset_download_url: https://zenodo.org/records/10108942/files/ANI-2x-wB97X-631Gd.tar.gz dataset_md5_checksum: cb1d9effb3d07fc1cc6ced7cd0b1e1f2 + dataset_length: 3705675413 + dataset_filename: ANI-2x-wB97X-631Gd.tar.gz diff --git a/modelforge/curation/yaml_files/qm9_curation.yaml b/modelforge/curation/yaml_files/qm9_curation.yaml index ebd342d2..9b05555c 100644 --- a/modelforge/curation/yaml_files/qm9_curation.yaml +++ b/modelforge/curation/yaml_files/qm9_curation.yaml @@ -4,3 +4,5 @@ v_0: version: 0 dataset_download_url: https://springernature.figshare.com/ndownloader/files/3195389 dataset_md5_checksum: ad1ebd51ee7f5b3a6e32e974e5d54012 + dataset_length: 86144227 + dataset_filename: dsgdb9nsd.xyz.tar.bz2 diff --git a/modelforge/curation/yaml_files/spice1_curation.yaml b/modelforge/curation/yaml_files/spice1_curation.yaml index 6285f26a..4be97814 100644 --- a/modelforge/curation/yaml_files/spice1_curation.yaml +++ b/modelforge/curation/yaml_files/spice1_curation.yaml @@ -5,3 +5,5 @@ v_0: notes: SPICE 1.1.4 release dataset_download_url: https://zenodo.org/records/8222043/files/SPICE-1.1.4.hdf5 dataset_md5_checksum: f27d4c81da0e37d6547276bf6b4ae6a1 + dataset_length: 16058156944 + dataset_filename: SPICE-1.1.4.hdf5 diff --git a/modelforge/curation/yaml_files/spice2_curation.yaml b/modelforge/curation/yaml_files/spice2_curation.yaml index 645cbd60..6ed537c0 100644 --- a/modelforge/curation/yaml_files/spice2_curation.yaml +++ b/modelforge/curation/yaml_files/spice2_curation.yaml @@ -5,3 +5,5 @@ v_0: notes: SPICE 2.0.1 release dataset_download_url: https://zenodo.org/records/10975225/files/SPICE-2.0.1.hdf5 dataset_md5_checksum: bfba2224b6540e1390a579569b475510 + dataset_length: 37479271148 + dataset_filename: SPICE-2.0.1.hdf5 diff --git a/modelforge/tests/test_remote.py b/modelforge/tests/test_remote.py index fbb1d1c0..174e9fd1 100644 --- a/modelforge/tests/test_remote.py +++ b/modelforge/tests/test_remote.py @@ -27,18 +27,20 @@ def test_is_url(): def test_download_from_url(prep_temp_dir): - url = "https://raw.githubusercontent.com/choderalab/modelforge/e3e65e15e23ccc55d03dd7abb4b9add7a7dd15c3/modelforge/modelforge.py" - checksum = "66ec18ca5db3df5791ff1ffc584363a8" + url = "https://zenodo.org/records/3401581/files/PTC-CMC/atools_ml-v0.1.zip" + checksum = "194cde222565dca8657d8521e5df1fd8" + + name = "atools_ml-v0.1.zip" # Download the file download_from_url( url, md5_checksum=checksum, output_path=str(prep_temp_dir), - output_filename="modelforge.py", + output_filename=name, force_download=True, ) - file_name_path = str(prep_temp_dir) + "/modelforge.py" + file_name_path = str(prep_temp_dir) + f"/{name}" assert os.path.isfile(file_name_path) # create a dummy document to test the case where @@ -51,11 +53,11 @@ def test_download_from_url(prep_temp_dir): url, md5_checksum=checksum, output_path=str(prep_temp_dir), - output_filename="modelforge.py", + output_filename=name, force_download=False, ) - file_name_path = str(prep_temp_dir) + "/modelforge.py" + file_name_path = str(prep_temp_dir) + f"/{name}" assert os.path.isfile(file_name_path) # let us change the expected checksum to cause a failure @@ -66,62 +68,7 @@ def test_download_from_url(prep_temp_dir): url, md5_checksum="checksum_garbage", output_path=str(prep_temp_dir), - output_filename="modelforge.py", - force_download=True, - ) - - -@pytest.mark.skip( - reason="This test seems to time out on the CI frequently. Will be refactoring and not need this soon." -) -def test_download_from_figshare(prep_temp_dir): - url = "https://figshare.com/ndownloader/files/22247589" - name = download_from_figshare( - url=url, - md5_checksum="c1459c5ddce7bb94800032aa3d04788e", - output_path=str(prep_temp_dir), - force_download=True, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - # create a dummy document to test the case where - # the checksum doesn't match so it will redownload - with open(file_name_path, "w") as f: - f.write("dummy document") - - # This will force a download because the checksum doesn't match - url = "https://figshare.com/ndownloader/files/22247589" - name = download_from_figshare( - url=url, - md5_checksum="c1459c5ddce7bb94800032aa3d04788e", - output_path=str(prep_temp_dir), - force_download=False, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - # the length of this file isn't listed in the headers - # this will check to make sure we can handle this case - url = "https://figshare.com/ndownloader/files/30975751" - name = download_from_figshare( - url=url, - md5_checksum="efa40abff1f71c121f6f0d444c18d5b3", - output_path=str(prep_temp_dir), - force_download=True, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - with pytest.raises(Exception): - url = "https://choderalab.com/ndownloader/files/22247589" - name = download_from_figshare( - url=url, - md5_checksum="c1459c5ddce7bb94800032aa3d04788e", - output_path=str(prep_temp_dir), + output_filename=name, force_download=True, ) @@ -137,56 +84,16 @@ def test_fetch_record_id(): fetch_url_from_doi(doi="10.5281/zenodo.3588339", timeout=0.0000000000001) -def test_download_from_zenodo(prep_temp_dir): - url = "https://zenodo.org/records/3401581/files/PTC-CMC/atools_ml-v0.1.zip" - zenodo_checksum = "194cde222565dca8657d8521e5df1fd8" - name = download_from_zenodo( - url=url, - md5_checksum=zenodo_checksum, - output_path=str(prep_temp_dir), - force_download=True, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - # create a dummy document to test the case where - # the checksum doesn't match so it will redownload - with open(file_name_path, "w") as f: - f.write("dummy document") - - # make sure that we redownload the file because the checksum of the - # existing file doesn't match - url = "https://zenodo.org/records/3401581/files/PTC-CMC/atools_ml-v0.1.zip" - zenodo_checksum = "194cde222565dca8657d8521e5df1fd8" - name = download_from_zenodo( - url=url, - md5_checksum=zenodo_checksum, - output_path=str(prep_temp_dir), - force_download=False, - ) - - file_name_path = str(prep_temp_dir) + f"/{name}" - assert os.path.isfile(file_name_path) - - with pytest.raises(Exception): - url = "https://choderalab.com/22247589" - name = download_from_zenodo( - url=url, - md5_checksum=zenodo_checksum, - output_path=str(prep_temp_dir), - force_download=True, - ) - - def test_md5_calculation(prep_temp_dir): url = "https://zenodo.org/records/3401581/files/PTC-CMC/atools_ml-v0.1.zip" zenodo_checksum = "194cde222565dca8657d8521e5df1fd8" - name = download_from_zenodo( + name = "atools_ml-v0.1.zip" + download_from_url( url=url, md5_checksum=zenodo_checksum, output_path=str(prep_temp_dir), + output_filename=name, force_download=True, ) @@ -199,9 +106,10 @@ def test_md5_calculation(prep_temp_dir): with pytest.raises(Exception): bad_checksum = "294badmd5checksumthatwontwork9de" - name = download_from_zenodo( + download_from_url( url=url, md5_checksum=bad_checksum, output_path=str(prep_temp_dir), + output_filename=name, force_download=True, ) diff --git a/modelforge/utils/remote.py b/modelforge/utils/remote.py index 2e09246f..a3d3f237 100644 --- a/modelforge/utils/remote.py +++ b/modelforge/utils/remote.py @@ -111,7 +111,7 @@ def download_from_url( output_filename: str, length: Optional[int] = None, force_download=False, -) -> str: +): import requests import os @@ -173,243 +173,3 @@ def download_from_url( logger.debug( "Using previously downloaded file; set force_download=True to re-download." ) - - -# Figshare helper functions -def download_from_figshare( - url: str, md5_checksum: str, output_path: str, force_download=False -) -> str: - """ - Downloads a dataset from figshare for a given ndownloader url. - - Parameters - ---------- - url: str, required - Figshare ndownloader url (i.e., link to the data downloader) - md5_checksum: str, required - Expected md5 checksum of the downloaded file. - output_path: str, required - Location to download the file to. - force_download: str, default=False - If False: if the file exists in output_path, code will will use the local version. - If True, the file will be downloaded, even if it exists in output_path. - - Returns - ------- - str - Name of the file downloaded. - - Examples - -------- - >>> url = 'https://springernature.figshare.com/ndownloader/files/18112775' - >>> output_path = '/path/to/directory' - >>> downloaded_file_name = download_from_figshare(url, output_path) - - """ - - import requests - import os - from tqdm import tqdm - - # force to use ipv4; my ubuntu machine is timing out when it first tries ipv6 - # requests.packages.urllib3.util.connection.HAS_IPV6 = False - - chunk_size = 512 - # check to make sure the url we are given is hosted by figshare.com - if not is_url(url, "figshare.com"): - raise Exception(f"{url} is not a valid figshare.com url") - - # get the head of the request - head = requests.head(url) - - # Because the url on figshare calls a downloader, instead of the direct file, - # we need to figure out where the original file is stored to know how big it is. - # Here we will parse the header info to get the file the downloader links to - # and then get the head info from this link to fetch the length. - # This is not actually necessary, but useful for updating the download status bar. - # We also fetch the name of the file from the header of the download link - - temp_url = head.headers["location"].split("?")[0] - name = head.headers["X-Filename"].split("/")[-1] - - # make sure we can handle a path with a ~ in it - output_path = os.path.expanduser(output_path) - - # We need to check to make sure that the file that is stored in the output path - # has the correct checksum, e.g., to avoid a case where we have a partially downloaded file - # or to make sure we don't have two files with the same name, but different content. - if os.path.isfile(f"{output_path}/{name}"): - calculated_checksum = calculate_md5_checksum( - file_name=name, file_path=output_path - ) - if calculated_checksum != md5_checksum: - force_download = True - logger.debug( - "Checksum of existing file does not match expected checksum, re-downloading." - ) - - if not os.path.isfile(f"{output_path}/{name}") or force_download: - logger.debug(f"Downloading datafile from figshare to {output_path}/{name}.") - - temp_url_headers = requests.head(temp_url) - - os.makedirs(output_path, exist_ok=True) - try: - length = int(temp_url_headers.headers["Content-Length"]) - except: - print( - "Could not determine the length of the file to download. The download bar will not be accurate." - ) - length = -1 - r = requests.get(url, stream=True) - - from modelforge.utils.misc import OpenWithLock - - with OpenWithLock(f"{output_path}/{name}.lockfile", "w") as fl: - with open(f"{output_path}/{name}", "wb") as fd: - # if we couldn't fetch the length from figshare, which seems to happen for some records - # we just don't know how long the tqdm bar will be. - if length == -1: - for chunk in tqdm( - r.iter_content(chunk_size=chunk_size), - ascii=True, - desc="downloading", - ): - fd.write(chunk) - else: - for chunk in tqdm( - r.iter_content(chunk_size=chunk_size), - ascii=True, - desc="downloading", - total=(int(length / chunk_size) + 1), - ): - fd.write(chunk) - os.remove(f"{output_path}/{name}.lockfile") - - calculated_checksum = calculate_md5_checksum( - file_name=name, file_path=output_path - ) - if calculated_checksum != md5_checksum: - raise Exception( - f"Checksum of downloaded file {calculated_checksum} does not match expected checksum {md5_checksum}" - ) - else: # if the file exists and we don't set force_download to True, just use the cached version - logger.debug(f"Datafile {name} already exists in {output_path}.") - logger.debug( - "Using previously downloaded file; set force_download=True to re-download." - ) - - return name - - -def download_from_zenodo( - url: str, md5_checksum: str, output_path: str, force_download=False -) -> str: - """ - Downloads a dataset from zenodo for a given url. - - If the datafile exists in the output_path, by default it will not be redownloaded. - - Parameters - ---------- - url : str, required - Direct link to datafile to download. - md5_checksum: str, required - Expected md5 checksum of the downloaded file. - output_path: str, required - Location to download the file to. - force_download: str, default=False - If False: if the file exists in output_path, code will will use the local version. - If True, the file will be downloaded, even if it exists in output_path. - - Returns - ------- - str - Name of the file downloaded. - - Examples - -------- - >>> url = "https://zenodo.org/records/3401581/files/PTC-CMC/atools_ml-v0.1.zip" - >>> output_path = '/path/to/directory' - >>> md5_checksum = "d41d8cd98f00b204e9800998ecf8427e" - >>> downloaded_file_name = download_from_zenodo(url, md5_checksum, output_path) - - """ - - import requests - import os - from tqdm import tqdm - - # force to use ipv4; my ubuntu machine is timing out when it first tries ipv6 - # requests.packages.urllib3.util.connection.HAS_IPV6 = False - - chunk_size = 512 - # check to make sure the url we are given is hosted by figshare.com - - if not is_url(url, "zenodo.org"): - raise Exception(f"{url} is not a valid zenodo.org url") - - # get the head of the request - head = requests.head(url) - - # Because the url on figshare calls a downloader, instead of the direct file, - # we need to figure out where the original file is stored to know how big it is. - # Here we will parse the header info to get the file the downloader links to - # and then get the head info from this link to fetch the length. - # This is not actually necessary, but useful for updating the download status bar. - # We also fetch the name of the file from the header of the download link - name = head.headers["Content-Disposition"].split("filename=")[-1] - length = int(head.headers["Content-Length"]) - - # make sure we can handle a path with a ~ in it - output_path = os.path.expanduser(output_path) - - # We need to check to make sure that the file that is stored in the output path - # has the correct checksum, e.g., to avoid a case where we have a partially downloaded file - # or to make sure we don't have two files with the same name, but different content. - - if os.path.isfile(f"{output_path}/{name}"): - calculated_checksum = calculate_md5_checksum( - file_name=name, file_path=output_path - ) - if calculated_checksum != md5_checksum: - force_download = True - logger.debug( - "Checksum of existing file does not match expected checksum, re-downloading." - ) - - if not os.path.isfile(f"{output_path}/{name}") or force_download: - logger.debug(f"Downloading datafile from zenodo to {output_path}/{name}.") - - r = requests.get(url, stream=True) - - os.makedirs(output_path, exist_ok=True) - - from modelforge.utils.misc import OpenWithLock - - with OpenWithLock(f"{output_path}/{name}.lockfile", "w") as fl: - with open(f"{output_path}/{name}", "wb") as fd: - for chunk in tqdm( - r.iter_content(chunk_size=chunk_size), - ascii=True, - desc="downloading", - total=(int(length / chunk_size) + 1), - ): - fd.write(chunk) - os.remove(f"{output_path}/{name}.lockfile") - - calculated_checksum = calculate_md5_checksum( - file_name=name, file_path=output_path - ) - if calculated_checksum != md5_checksum: - raise Exception( - f"Checksum of downloaded file {calculated_checksum} does not match expected checksum {md5_checksum}." - ) - - else: # if the file exists and we don't set force_download to True, just use the cached version - logger.debug(f"Datafile {name} already exists in {output_path}.") - logger.debug( - "Using previously downloaded file; set force_download=True to re-download." - ) - - return name