From 93262942ebf4ade514e27b79c9b12ce44c27b8d3 Mon Sep 17 00:00:00 2001
From: narugo1992
Date: Fri, 2 Aug 2024 05:59:10 +0800
Subject: [PATCH] dev(narugo): save local index code

---
 hfutils/index/__init__.py      |   1 +
 hfutils/index/fetch.py         |   4 +-
 hfutils/index/local_fetch.py   | 157 +++++++++++++++++++++++++++++++++
 test/index/conftest.py         |  15 ++++
 test/index/test_local_fetch.py | 136 ++++++++++++++++++++++++++++
 5 files changed, 311 insertions(+), 2 deletions(-)
 create mode 100644 hfutils/index/local_fetch.py
 create mode 100644 test/index/conftest.py
 create mode 100644 test/index/test_local_fetch.py

diff --git a/hfutils/index/__init__.py b/hfutils/index/__init__.py
index 7631e21a3e..c01f5c74af 100644
--- a/hfutils/index/__init__.py
+++ b/hfutils/index/__init__.py
@@ -1,4 +1,5 @@
 from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
     ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
+from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
 from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
 from .validate import hf_tar_item_validate, hf_tar_validate
diff --git a/hfutils/index/fetch.py b/hfutils/index/fetch.py
index a33f95f3d8..f15b3f1a31 100644
--- a/hfutils/index/fetch.py
+++ b/hfutils/index/fetch.py
@@ -1,6 +1,6 @@
 import json
 import os.path
-from typing import Optional, Dict, Union
+from typing import Optional, Dict, Union, List
 
 from huggingface_hub.file_download import http_get, hf_hub_url
 from huggingface_hub.utils import build_hf_headers
@@ -96,7 +96,7 @@ def hf_tar_list_files(repo_id: str, archive_in_repo: str,
                       repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
                       idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
                       idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
-                      hf_token: Optional[str] = None):
+                      hf_token: Optional[str] = None) -> List[str]:
     """
     List files inside a tar archive file in a Hugging Face repository.
 
diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py
new file mode 100644
index 0000000000..31fb7c239f
--- /dev/null
+++ b/hfutils/index/local_fetch.py
@@ -0,0 +1,157 @@
+import json
+import os
+from typing import Optional, List
+
+
+def tar_get_index(archive_file: str, idx_file: Optional[str] = None):
+    """
+    Load the JSON index of a local tar archive.
+
+    :param archive_file: Path of the local tar archive.
+    :param idx_file: Path of the index file. When omitted, the archive path
+        with its extension replaced by ``.json`` is used.
+    :return: The parsed index data.
+    """
+    body, _ = os.path.splitext(archive_file)
+    default_index_file = f'{body}.json'
+    with open(idx_file or default_index_file, 'r') as f:
+        return json.load(f)
+
+
+def tar_list_files(archive_file: str, idx_file: Optional[str] = None) -> List[str]:
+    """
+    List the files recorded in the index of a local tar archive.
+
+    :param archive_file: Path of the local tar archive.
+    :param idx_file: Path of the index file, see :func:`tar_get_index`.
+    :return: Keys of the ``files`` mapping of the index.
+    """
+    index_data = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    return list(index_data['files'].keys())
+
+
+def tar_file_exists(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> bool:
+    """
+    Check whether a file is recorded in the index of a local tar archive.
+
+    :param archive_file: Path of the local tar archive.
+    :param file_in_archive: Path of the file inside the archive.
+    :param idx_file: Path of the index file, see :func:`tar_get_index`.
+    :return: ``True`` if the normalized path is present in the index.
+    """
+    from .fetch import _hf_files_process, _n_path
+    index = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    files = _hf_files_process(index['files'])
+    return _n_path(file_in_archive) in files
+
+
+def tar_file_info(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> dict:
+    """
+    Get the index entry of a file inside a local tar archive.
+
+    :param archive_file: Path of the local tar archive.
+    :param file_in_archive: Path of the file inside the archive.
+    :param idx_file: Path of the index file, see :func:`tar_get_index`.
+    :return: Index entry of the file.
+    :raises FileNotFoundError: If the file is not present in the index.
+    """
+    from .fetch import _hf_files_process, _n_path
+    index = tar_get_index(
+        archive_file=archive_file,
+        idx_file=idx_file,
+    )
+    files = _hf_files_process(index['files'])
+    key = _n_path(file_in_archive)
+    if key not in files:
+        raise FileNotFoundError(f'File {file_in_archive!r} not found '
+                                f'in local archive {archive_file!r}.')
+    return files[key]
+
+
+def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> int:
+    """
+    Get the size of a file inside a local tar archive.
+
+    :param archive_file: Path of the local tar archive.
+    :param file_in_archive: Path of the file inside the archive.
+    :param idx_file: Path of the index file, see :func:`tar_get_index`.
+    :return: The ``size`` field of the file's index entry.
+    :raises FileNotFoundError: If the file is not present in the index.
+    """
+    return tar_file_info(
+        archive_file=archive_file,
+        file_in_archive=file_in_archive,
+        idx_file=idx_file,
+    )['size']
+
+
+def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
+                      idx_file: Optional[str] = None, chunk_size: int = 1 << 20):
+    """
+    Extract one file from a local tar archive using the archive's index.
+
+    If anything fails while writing or verifying, ``local_file`` is removed.
+
+    :param archive_file: Path of the local tar archive.
+    :param file_in_archive: Path of the file inside the archive.
+    :param local_file: Path to write the extracted file to.
+    :param idx_file: Path of the index file, see :func:`tar_get_index`.
+    :param chunk_size: Maximum number of bytes copied per read.
+    :raises ValueError: If ``chunk_size`` is not positive.
+    :raises FileNotFoundError: If the file is not present in the index.
+    :raises ArchiveStandaloneFileIncompleteDownload: If the extracted size
+        differs from the size recorded in the index.
+    :raises ArchiveStandaloneFileHashNotMatch: If the index records a
+        sha256 and the extracted file does not match it.
+    """
+    from .fetch import _f_sha256, \
+        ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch
+
+    if chunk_size <= 0:
+        raise ValueError(f'Chunk size should be positive, but {chunk_size!r} found.')
+
+    info = tar_file_info(
+        archive_file=archive_file,
+        file_in_archive=file_in_archive,
+        idx_file=idx_file,
+    )
+
+    if os.path.dirname(local_file):
+        os.makedirs(os.path.dirname(local_file), exist_ok=True)
+    try:
+        with open(local_file, 'wb') as wf:
+            if info['size'] > 0:
+                with open(archive_file, 'rb') as rf:
+                    rf.seek(info['offset'])
+                    remaining = info['size']
+                    while remaining > 0:
+                        chunk = rf.read(min(remaining, chunk_size))
+                        if not chunk:
+                            # Archive is shorter than the index claims. Stop instead of
+                            # looping forever on empty reads; the size check reports it.
+                            break
+                        wf.write(chunk)
+                        remaining -= len(chunk)
+
+        if os.path.getsize(local_file) != info['size']:
+            raise ArchiveStandaloneFileIncompleteDownload(
+                f'Expected size is {info["size"]}, but actually {os.path.getsize(local_file)} downloaded.'
+            )
+
+        if info.get('sha256'):
+            _sha256 = _f_sha256(local_file)
+            if _sha256 != info['sha256']:
+                raise ArchiveStandaloneFileHashNotMatch(
+                    f'Expected hash is {info["sha256"]!r}, but actually {_sha256!r} found.'
+                )
+
+    except Exception:
+        if os.path.exists(local_file):
+            os.remove(local_file)
+        raise
diff --git a/test/index/conftest.py b/test/index/conftest.py
new file mode 100644
index 0000000000..ee7265679d
--- /dev/null
+++ b/test/index/conftest.py
@@ -0,0 +1,15 @@
+import pytest
+from hbutils.system import TemporaryDirectory
+
+from hfutils.operate import download_directory_as_directory
+
+
+@pytest.fixture(scope='module')
+def local_narugo_test_cos5t_tars():
+    with TemporaryDirectory() as td:
+        download_directory_as_directory(
+            repo_id='narugo/test_cos5t_tars',
+            repo_type='dataset',
+            local_directory=td,
+        )
+        yield td
diff --git a/test/index/test_local_fetch.py b/test/index/test_local_fetch.py
new file mode 100644
index 0000000000..92f12a07d4
--- /dev/null
+++ b/test/index/test_local_fetch.py
@@ -0,0 +1,136 @@
+import os.path
+
+import pytest
+from hbutils.testing import isolated_directory
+from natsort import natsorted
+
+from hfutils.index import tar_list_files, tar_file_exists, tar_file_download, tar_file_info, \
+    tar_file_size
+from test.testings import get_testfile, file_compare
+
+
+@pytest.mark.unittest
+class TestIndexLocalFetch:
+    def test_tar_list_files(self, local_narugo_test_cos5t_tars):
+        files = tar_list_files(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+        )
+        assert len(files) == 17
+        assert natsorted(files) == [
+            '.meta.json', 'Bright_Voyager.png', 'Grail_League_1星.png', 'Grail_League_2星.png', 'Grail_League_3星.png',
+            'Grail_League_4星.png', 'Grail_League_5星.png', '奥特瑙斯.png', '奥特瑙斯_改建型.png', '常夏的泳装.png',
+            '常夏的泳装Ver_02.png', '愚人节.png', '愚人节_奥特瑙斯.png', '第1阶段.png', '第2阶段.png', '第3阶段.png',
+            '第4阶段.png'
+        ]
+
+    def test_tar_file_exists(self, local_narugo_test_cos5t_tars):
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        )
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        )
+        assert tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        )
+        assert not tar_file_exists(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节奥特瑙斯.png'
+        )
+
+    def test_tar_file_info(self, local_narugo_test_cos5t_tars):
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        ) == {
+            'offset': 2725376,
+            'sha256': '4585b01c251a496b73cb231d29fc711cfb1d682a84334d95f6f5b6c1cc5b5222',
+            'size': 8968
+        }
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        ) == {
+            'offset': 3954176,
+            'sha256': '991497fa586f6f4529827e0f8f1f228c20ec9fb507c314ee9d20d47c46f26e89',
+            'size': 255276
+        }
+        assert tar_file_info(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        ) == {
+            'offset': 3954176,
+            'sha256': '991497fa586f6f4529827e0f8f1f228c20ec9fb507c314ee9d20d47c46f26e89',
+            'size': 255276
+        }
+        with pytest.raises(FileNotFoundError):
+            _ = tar_file_info(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='愚人节奥特瑙斯.png'
+            )
+
+    def test_tar_file_size(self, local_narugo_test_cos5t_tars):
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='.meta.json'
+        ) == 8968
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='愚人节_奥特瑙斯.png'
+        ) == 255276
+        assert tar_file_size(
+            archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+            file_in_archive='./愚人节_奥特瑙斯.png'
+        ) == 255276
+        with pytest.raises(FileNotFoundError):
+            _ = tar_file_size(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='愚人节奥特瑙斯.png'
+            )
+
+    def test_tar_file_download_small(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='.meta.json',
+                local_file='.meta.json'
+            )
+            file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')
+
+    def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='./愚人节_奥特瑙斯.png',
+                local_file='愚人节_奥特瑙斯.png'
+            )
+            file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')
+
+    def test_tar_file_download_not_found(self, local_narugo_test_cos5t_tars):
+        with isolated_directory(), pytest.raises(FileNotFoundError):
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar'),
+                file_in_archive='./愚人节奥特瑙斯.png',
+                local_file='愚人节_奥特瑙斯.png'
+            )
+
+    def test_tar_file_download_subdir(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'ex3.tar'),
+                file_in_archive='artoria_caster_third_ascension_fate/sankaku_21305298.jpg',
+                local_file='f/ac.jpg'
+            )
+            file_compare(get_testfile('sankaku_21305298.jpg'), 'f/ac.jpg')
+
+    def test_tar_file_download_empty(self, local_narugo_test_cos5t_tars):
+        with isolated_directory():
+            tar_file_download(
+                archive_file=os.path.join(local_narugo_test_cos5t_tars, 'empty_file.tar'),
+                file_in_archive='empty_file',
+                local_file='empty_file',
+            )
+            assert os.path.getsize('empty_file') == 0