diff --git a/hfutils/index/fetch.py b/hfutils/index/fetch.py index d62f259046..29e3811779 100644 --- a/hfutils/index/fetch.py +++ b/hfutils/index/fetch.py @@ -438,7 +438,7 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, proxies: Optional[Dict] = None, user_agent: Union[Dict, str, None] = None, headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None, - hf_token: Optional[str] = None): + force_download: bool = False, hf_token: Optional[str] = None): """ Download a file from a tar archive file in a Hugging Face repository. @@ -470,6 +470,8 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st :type headers: Dict[str, str], optional :param endpoint: The Hugging Face API endpoint. :type endpoint: str, optional + :param force_download: Force download the file to destination path. Defaults to `False`. + :type force_download: bool :param hf_token: The Hugging Face access token. :type hf_token: str, optional :raises FileNotFoundError: Raise this when file not exist in tar archive. 
@@ -533,6 +535,13 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st end_bytes = info['offset'] + info['size'] - 1 headers['Range'] = f'bytes={start_bytes}-{end_bytes}' + if not force_download and os.path.exists(local_file) and \ + os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']: + _expected_sha256 = info.get('sha256') + if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256: + # file already ready, no need to download it again + return + if os.path.dirname(local_file): os.makedirs(os.path.dirname(local_file), exist_ok=True) try: diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py index 4d86fbb5b2..c86ea1c01b 100644 --- a/hfutils/index/local_fetch.py +++ b/hfutils/index/local_fetch.py @@ -171,7 +171,7 @@ def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[st def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, - idx_file: Optional[str] = None, chunk_size: int = 1 << 20): + idx_file: Optional[str] = None, chunk_size: int = 1 << 20, force_download: bool = False): """ Extract and download a specific file from the tar archive to a local file. @@ -190,6 +190,8 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, :type idx_file: Optional[str] :param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB. :type chunk_size: int + :param force_download: Force download the file to destination path. Defaults to `False`. + :type force_download: bool :raises FileNotFoundError: If the specified file is not found in the archive. :raises ArchiveStandaloneFileIncompleteDownload: If the downloaded file size doesn't match the expected size. 
@@ -212,6 +214,13 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, info = files[_n_path(file_in_archive)] + if not force_download and os.path.exists(local_file) and \ + os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']: + _expected_sha256 = info.get('sha256') + if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256: + # file already ready, no need to download it again + return + if os.path.dirname(local_file): os.makedirs(os.path.dirname(local_file), exist_ok=True) try: