From 6749c41f2a3a898103cb4ab58e1f17c2e017b7b3 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 1 Sep 2024 17:46:58 +0800 Subject: [PATCH 1/2] dev(narugo): add lazy download mode --- hfutils/index/fetch.py | 11 ++++++++++- hfutils/index/local_fetch.py | 11 ++++++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/hfutils/index/fetch.py b/hfutils/index/fetch.py index d62f259046..29e3811779 100644 --- a/hfutils/index/fetch.py +++ b/hfutils/index/fetch.py @@ -438,7 +438,7 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, proxies: Optional[Dict] = None, user_agent: Union[Dict, str, None] = None, headers: Optional[Dict[str, str]] = None, endpoint: Optional[str] = None, - hf_token: Optional[str] = None): + force_download: bool = False, hf_token: Optional[str] = None): """ Download a file from a tar archive file in a Hugging Face repository. @@ -470,6 +470,8 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st :type headers: Dict[str, str], optional :param endpoint: The Hugging Face API endpoint. :type endpoint: str, optional + :param force_download: Force download the file to destination path. Defualt to `False`. + :type force_download: bool :param hf_token: The Hugging Face access token. :type hf_token: str, optional :raises FileNotFoundError: Raise this when file not exist in tar archive. 
@@ -533,6 +535,13 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st end_bytes = info['offset'] + info['size'] - 1 headers['Range'] = f'bytes={start_bytes}-{end_bytes}' + if not force_download and os.path.exists(local_file) and \ + os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']: + _expected_sha256 = info.get('sha256') + if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256: + # file already ready, no need to download it again + return + if os.path.dirname(local_file): os.makedirs(os.path.dirname(local_file), exist_ok=True) try: diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py index 4d86fbb5b2..c86ea1c01b 100644 --- a/hfutils/index/local_fetch.py +++ b/hfutils/index/local_fetch.py @@ -171,7 +171,7 @@ def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[st def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, - idx_file: Optional[str] = None, chunk_size: int = 1 << 20): + idx_file: Optional[str] = None, chunk_size: int = 1 << 20, force_download: bool = False): """ Extract and download a specific file from the tar archive to a local file. @@ -190,6 +190,8 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, :type idx_file: Optional[str] :param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB. :type chunk_size: int + :param force_download: Force download the file to destination path. Defualt to `False`. + :type force_download: bool :raises FileNotFoundError: If the specified file is not found in the archive. :raises ArchiveStandaloneFileIncompleteDownload: If the downloaded file size doesn't match the expected size. 
@@ -212,6 +214,13 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, info = files[_n_path(file_in_archive)] + if not force_download and os.path.exists(local_file) and \ + os.path.isfile(local_file) and os.path.getsize(local_file) == info['size']: + _expected_sha256 = info.get('sha256') + if not _expected_sha256 or _f_sha256(local_file) == _expected_sha256: + # file already ready, no need to download it again + return + if os.path.dirname(local_file): os.makedirs(os.path.dirname(local_file), exist_ok=True) try: From 9ee1475fdb421f6fb0965aa695528ed31fbd03da Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 1 Sep 2024 18:13:24 +0800 Subject: [PATCH 2/2] dev(narugo): add unittest --- hfutils/index/fetch.py | 4 ++- hfutils/index/local_fetch.py | 4 ++- test/index/test_fetch.py | 48 ++++++++++++++++++++++++++++++++++ test/index/test_local_fetch.py | 44 +++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 2 deletions(-) diff --git a/hfutils/index/fetch.py b/hfutils/index/fetch.py index 29e3811779..50f01c091c 100644 --- a/hfutils/index/fetch.py +++ b/hfutils/index/fetch.py @@ -470,7 +470,9 @@ def hf_tar_file_download(repo_id: str, archive_in_repo: str, file_in_archive: st :type headers: Dict[str, str], optional :param endpoint: The Hugging Face API endpoint. :type endpoint: str, optional - :param force_download: Force download the file to destination path. Defualt to `False`. + :param force_download: Force download the file to destination path. + Default to `False`, downloading will be skipped if the local file + is fully matched with expected file. :type force_download: bool :param hf_token: The Hugging Face access token. 
:type hf_token: str, optional diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py index c86ea1c01b..4343da7c13 100644 --- a/hfutils/index/local_fetch.py +++ b/hfutils/index/local_fetch.py @@ -190,7 +190,9 @@ def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, :type idx_file: Optional[str] :param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB. :type chunk_size: int - :param force_download: Force download the file to destination path. Defualt to `False`. + :param force_download: Force download the file to destination path. + Default to `False`, downloading will be skipped if the local file + is fully matched with expected file. :type force_download: bool :raises FileNotFoundError: If the specified file is not found in the archive. diff --git a/test/index/test_fetch.py b/test/index/test_fetch.py index 11ebbdfc9d..6581ef64bd 100644 --- a/test/index/test_fetch.py +++ b/test/index/test_fetch.py @@ -114,6 +114,30 @@ def test_hf_tar_file_download_small(self): ) file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + def test_hf_tar_file_download_small_exist(self): + with isolated_directory({ + '.meta.json': get_testfile('skin_mashu', '.meta.json') + }): + hf_tar_file_download( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + file_in_archive='.meta.json', + local_file='.meta.json' + ) + file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + + def test_hf_tar_file_download_small_replace(self): + with isolated_directory({ + '.meta.json': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png') + }): + hf_tar_file_download( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + file_in_archive='.meta.json', + local_file='.meta.json' + ) + file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + def test_hf_tar_file_download_lfs(self): with isolated_directory(): hf_tar_file_download( @@ -124,6 +148,30 @@ def 
test_hf_tar_file_download_lfs(self): ) file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + def test_hf_tar_file_download_lfs_exist(self): + with isolated_directory({ + '愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png') + }): + hf_tar_file_download( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + file_in_archive='./愚人节_奥特瑙斯.png', + local_file='愚人节_奥特瑙斯.png' + ) + file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + + def test_hf_tar_file_download_lfs_replace(self): + with isolated_directory({ + '愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '.meta.json') + }): + hf_tar_file_download( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + file_in_archive='./愚人节_奥特瑙斯.png', + local_file='愚人节_奥特瑙斯.png' + ) + file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + def test_hf_tar_file_download_not_found(self): with isolated_directory(), pytest.raises(FileNotFoundError): hf_tar_file_download( diff --git a/test/index/test_local_fetch.py b/test/index/test_local_fetch.py index 92f12a07d4..45dc1e4603 100644 --- a/test/index/test_local_fetch.py +++ b/test/index/test_local_fetch.py @@ -100,6 +100,28 @@ def test_tar_file_download_small(self, local_narugo_test_cos5t_tars): ) file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + def test_tar_file_download_small_exist(self, local_narugo_test_cos5t_tars): + with isolated_directory({ + '.meta.json': get_testfile('skin_mashu', '.meta.json') + }): + tar_file_download( + archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'), + file_in_archive='.meta.json', + local_file='.meta.json' + ) + file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + + def test_tar_file_download_small_replace(self, local_narugo_test_cos5t_tars): + with isolated_directory({ + '.meta.json': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png') + }): + tar_file_download( + 
archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'), + file_in_archive='.meta.json', + local_file='.meta.json' + ) + file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json') + def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars): with isolated_directory(): tar_file_download( @@ -109,6 +131,28 @@ def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars): ) file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + def test_tar_file_download_lfs_exist(self, local_narugo_test_cos5t_tars): + with isolated_directory({ + '愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), + }): + tar_file_download( + archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'), + file_in_archive='./愚人节_奥特瑙斯.png', + local_file='愚人节_奥特瑙斯.png', + ) + file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + + def test_tar_file_download_lfs_replace(self, local_narugo_test_cos5t_tars): + with isolated_directory({ + '愚人节_奥特瑙斯.png': get_testfile('skin_mashu', '.meta.json'), + }): + tar_file_download( + archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'), + file_in_archive='./愚人节_奥特瑙斯.png', + local_file='愚人节_奥特瑙斯.png', + ) + file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png') + def test_tar_file_download_not_found(self, local_narugo_test_cos5t_tars): with isolated_directory(), pytest.raises(FileNotFoundError): tar_file_download(