From ab49868f1fe544685622ce5b775776f383e8241a Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 21 Apr 2024 15:13:48 +0800 Subject: [PATCH 1/3] dev(narugo): add more tar --- test/index/test_make.py | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) diff --git a/test/index/test_make.py b/test/index/test_make.py index cd6550146c..341c3304f3 100644 --- a/test/index/test_make.py +++ b/test/index/test_make.py @@ -1,4 +1,5 @@ import json +import os.path import pytest from hbutils.testing import isolated_directory @@ -64,3 +65,54 @@ def test_tar_create_index(self, raw_tar): 'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1', 'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a' } + + def test_tar_create_index_subdir(self, raw_tar): + with isolated_directory({os.path.join('subdir', 'raw.tar'): raw_tar}): + tar_create_index(os.path.join('subdir', 'raw.tar')) + with open(os.path.join('subdir', 'raw.json'), 'r') as f: + assert json.load(f) == { + 'files': { + '1.txt': { + 'offset': 3584, + 'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9', + 'size': 13 + }, + 'README.md': { + 'offset': 1536, + 'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77', + 'size': 51 + }, + 'subdir/script.py': { + 'offset': 5632, + 'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867', + 'size': 33 + } + }, + 'filesize': 10240, + 'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1', + 'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a' + } + + def test_tar_create_index_subdir_no_hash(self, raw_tar): + with isolated_directory({os.path.join('subdir', 'raw.tar'): raw_tar}): + tar_create_index(os.path.join('subdir', 'raw.tar'), with_hash=False) + with open(os.path.join('subdir', 'raw.json'), 'r') as f: + assert json.load(f) == { + 'files': { + '1.txt': { + 'offset': 3584, + 'size': 13 + }, + 'README.md': { + 'offset': 1536, + 'size': 51 + }, + 'subdir/script.py': { + 'offset': 5632, + 'size': 33 + } + }, + 'filesize': 10240, + 'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1', + 'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a' + } From 2328692daf7a82aba6af55455c472e3247cd089b Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 21 Apr 2024 16:28:22 +0800 Subject: [PATCH 2/3] dev(narugo): add index sync --- docs/source/api_doc/index/index.rst | 1 + docs/source/api_doc/index/validate.rst | 22 +++++ hfutils/index/__init__.py | 1 + hfutils/index/validate.py | 107 +++++++++++++++++++++++++ test/index/test_validate.py | 34 ++++++++ 5 files changed, 165 insertions(+) create mode 100644 docs/source/api_doc/index/validate.rst create mode 100644 hfutils/index/validate.py create mode 100644 test/index/test_validate.py diff --git a/docs/source/api_doc/index/index.rst b/docs/source/api_doc/index/index.rst index a8eb079af0..b1fb93080b 100644 --- a/docs/source/api_doc/index/index.rst +++ b/docs/source/api_doc/index/index.rst @@ -11,4 +11,5 @@ hfutils.index fetch make + validate diff --git a/docs/source/api_doc/index/validate.rst b/docs/source/api_doc/index/validate.rst new file mode 100644 index 0000000000..fdac3cd959 --- /dev/null +++ b/docs/source/api_doc/index/validate.rst @@ -0,0 +1,22 @@ +hfutils.index.validate +================================ + +.. currentmodule:: hfutils.index.validate + +.. automodule:: hfutils.index.validate + + +hf_tar_item_validate +-------------------------------------- + +.. autofunction:: hf_tar_item_validate + + + +hf_tar_validate +-------------------------------------- + +.. autofunction:: hf_tar_validate + + + diff --git a/hfutils/index/__init__.py b/hfutils/index/__init__.py index 9ca0de4115..fa11495efb 100644 --- a/hfutils/index/__init__.py +++ b/hfutils/index/__init__.py @@ -1,3 +1,4 @@ from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \ ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory +from .validate import hf_tar_item_validate, hf_tar_validate diff --git a/hfutils/index/validate.py b/hfutils/index/validate.py new file mode 100644 index 0000000000..c330b0a3bf --- /dev/null +++ b/hfutils/index/validate.py @@ -0,0 +1,107 @@ +from typing import Optional + +from huggingface_hub.hf_api import RepoFile +from huggingface_hub.utils import EntryNotFoundError + +from .fetch import hf_tar_get_index +from ..operate.base import RepoTypeTyping, get_hf_client + + +def hf_tar_item_validate(file_item: RepoFile, size: int, hash_: Optional[str] = None, hash_lfs: Optional[str] = None): + """ + Validate a file item in a tar archive. + + This function checks if the file item matches the expected size and hash. + + :param file_item: The file item from the Hugging Face repository. + :type file_item: RepoFile + :param size: The expected size of the file. + :type size: int + :param hash_: The expected SHA-1 hash of the file. + :type hash_: str, optional + :param hash_lfs: The expected SHA-256 hash of the file if stored in LFS. + :type hash_lfs: str, optional + :return: True if the file item is valid, False otherwise. + :rtype: bool + """ + # size not match + if (file_item.lfs and size != file_item.lfs.size) or \ + (not file_item.lfs and size != file_item.size): + return False + + # compare tar file hash + item_hashes = [file_item.blob_id] + if file_item.lfs: + item_hashes.append(file_item.lfs.sha256) + item_hashes = set(filter(bool, item_hashes)) + cmp_hashes = [hash_, hash_lfs] + cmp_hashes = set(filter(bool, cmp_hashes)) + return bool(cmp_hashes & item_hashes) + + +def hf_tar_validate(repo_id: str, archive_in_repo: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', + idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None, + idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, + hf_token: Optional[str] = None): + """ + Validate a tar archive in a Hugging Face repository. + + This function validates if the tar archive in the Hugging Face repository matches the expected size and hash. + + :param repo_id: The ID of the Hugging Face repository. + :type repo_id: str + :param archive_in_repo: The path to the tar archive in the repository. + :type archive_in_repo: str + :param repo_type: The type of the Hugging Face repository, defaults to 'dataset'. + :type repo_type: RepoTypeTyping, optional + :param revision: The revision of the repository, defaults to 'main'. + :type revision: str, optional + :param idx_repo_id: The ID of the repository where the index file is stored. + :type idx_repo_id: Optional[str], optional + :param idx_file_in_repo: The path to the index file in the repository. + :type idx_file_in_repo: Optional[str], optional + :param idx_repo_type: The type of the repository where the index file is stored. + :type idx_repo_type: Optional[RepoTypeTyping], optional + :param idx_revision: The revision of the repository where the index file is stored. + :type idx_revision: Optional[str], optional + :param hf_token: The Hugging Face token for authentication, defaults to None. + :type hf_token: Optional[str], optional + :raises EntryNotFoundError: If the specified entry is not found in the repository. + :raises IsADirectoryError: If the specified entry is a directory. + :return: True if the tar archive is valid, False otherwise. + :rtype: bool + """ + hf_client = get_hf_client(hf_token) + + items = list(hf_client.get_paths_info( + repo_id=repo_id, + repo_type=repo_type, + paths=[archive_in_repo], + revision=revision, + )) + if len(items) == 0: + raise EntryNotFoundError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} not found.') + elif not isinstance(items[0], RepoFile): + raise IsADirectoryError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} is a directory, not a file.') + else: + item = items[0] + + index = hf_tar_get_index( + repo_id=repo_id, + archive_in_repo=archive_in_repo, + repo_type=repo_type, + revision=revision, + + idx_repo_id=idx_repo_id, + idx_file_in_repo=idx_file_in_repo, + idx_repo_type=idx_repo_type, + idx_revision=idx_revision, + + hf_token=hf_token, + ) + return hf_tar_item_validate( + file_item=item, + size=index['filesize'], + hash_=index.get('hash'), + hash_lfs=index.get('hash_lfs'), + ) diff --git a/test/index/test_validate.py b/test/index/test_validate.py new file mode 100644 index 0000000000..456c5ee88f --- /dev/null +++ b/test/index/test_validate.py @@ -0,0 +1,34 @@ +import pytest +from huggingface_hub.utils import EntryNotFoundError + +from hfutils.index import hf_tar_validate + + +@pytest.mark.unittest +class TestIndexValidate: + def test_hf_tar_file_download_lfs(self): + assert hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + ) + + def test_hf_tar_file_download_lfs_extra(self): + assert not hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + idx_file_in_repo='ex3.json' + ) + + def test_hf_tar_file_download_lfs_not_found(self): + with pytest.raises(EntryNotFoundError): + hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins_not_found.tar', + ) + + def test_hf_tar_file_download_lfs_is_directory(self): + with pytest.raises(IsADirectoryError): + hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='1001-1500', + ) From 8ebb7fc2e7f54bb315d249e3cd5344334175cafa Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Sun, 21 Apr 2024 16:47:35 +0800 Subject: [PATCH 3/3] dev(narugo): add index validate --- hfutils/index/make.py | 48 +++++++++++++++++++++++++++---------- hfutils/index/validate.py | 28 ++++++++++++---------- test/index/test_validate.py | 14 +++++++++++ 3 files changed, 65 insertions(+), 25 deletions(-) diff --git a/hfutils/index/make.py b/hfutils/index/make.py index 92962f8cf8..46a073df4c 100644 --- a/hfutils/index/make.py +++ b/hfutils/index/make.py @@ -92,25 +92,27 @@ def tar_create_index(src_tar_file, dst_index_file: Optional[str] = None, return dst_index_file -def hf_tar_create_index(repo_id: str, filename: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', - idx_repo_id: Optional[str] = None, idx_filename: Optional[str] = None, +def hf_tar_create_index(repo_id: str, archive_in_repo: str, + repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', + idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None, idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, - chunk_for_hash: int = 1 << 20, with_hash: bool = True, hf_token: Optional[str] = None): + chunk_for_hash: int = 1 << 20, with_hash: bool = True, skip_when_synced: bool = True, + hf_token: Optional[str] = None, ): """ Create an index file for a tar archive file in a Hugging Face repository. :param repo_id: The identifier of the repository. :type repo_id: str - :param filename: The path to the tar archive file. - :type filename: str + :param archive_in_repo: The path to the tar archive file. + :type archive_in_repo: str :param repo_type: The type of the Hugging Face repository, defaults to 'dataset'. :type repo_type: RepoTypeTyping, optional :param revision: The revision of the repository, defaults to 'main'. :type revision: str, optional :param idx_repo_id: The identifier of the index repository, defaults to None. :type idx_repo_id: str, optional - :param idx_filename: The path to save the index file in the index repository, defaults to None. - :type idx_filename: str, optional + :param idx_file_in_repo: The path to save the index file in the index repository, defaults to None. + :type idx_file_in_repo: str, optional :param idx_repo_type: The type of the index repository, defaults to None. :type idx_repo_type: RepoTypeTyping, optional :param idx_revision: The revision of the index repository, defaults to None. @@ -119,31 +121,51 @@ def hf_tar_create_index(repo_id: str, filename: str, repo_type: RepoTypeTyping = :type chunk_for_hash: int, optional :param with_hash: Whether to include file hashes in the index, defaults to True. :type with_hash: bool, optional + :param skip_when_synced: Skip syncing when index is ready, defaults to True. + :type skip_when_synced: bool :param hf_token: The Hugging Face access token, defaults to None. :type hf_token: str, optional """ + body, _ = os.path.splitext(archive_in_repo) + default_index_filename = f'{body}.json' + + from .validate import hf_tar_validate + if skip_when_synced and hf_tar_validate( + repo_id=repo_id, + repo_type=repo_type, + archive_in_repo=archive_in_repo, + revision=revision, + + idx_repo_id=idx_repo_id or repo_id, + idx_repo_type=idx_repo_type or repo_type, + idx_file_in_repo=idx_file_in_repo or default_index_filename, + idx_revision=idx_revision or revision, + + hf_token=hf_token, + ): + logging.info(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} already indexed, skipped.') + return + with TemporaryDirectory() as td: - local_tar_file = os.path.join(td, os.path.basename(filename)) + local_tar_file = os.path.join(td, os.path.basename(archive_in_repo)) download_file_to_file( repo_id=repo_id, repo_type=repo_type, - file_in_repo=filename, + file_in_repo=archive_in_repo, local_file=local_tar_file, revision=revision, hf_token=hf_token, ) dst_index_file = tar_create_index(local_tar_file, chunk_for_hash=chunk_for_hash, with_hash=with_hash) - body, _ = os.path.splitext(filename) - default_index_filename = f'{body}.json' upload_file_to_file( repo_id=idx_repo_id or repo_id, repo_type=idx_repo_type or repo_type, - file_in_repo=idx_filename or default_index_filename, + file_in_repo=idx_file_in_repo or default_index_filename, local_file=dst_index_file, revision=idx_revision or revision, hf_token=hf_token, - message=f'Create index for {repo_type}s/{repo_id}@{revision}/{filename}', + message=f'Create index for {repo_type}s/{repo_id}@{revision}/{archive_in_repo}', ) diff --git a/hfutils/index/validate.py b/hfutils/index/validate.py index c330b0a3bf..a9c39aaac0 100644 --- a/hfutils/index/validate.py +++ b/hfutils/index/validate.py @@ -1,7 +1,7 @@ from typing import Optional from huggingface_hub.hf_api import RepoFile -from huggingface_hub.utils import EntryNotFoundError +from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError from .fetch import hf_tar_get_index from ..operate.base import RepoTypeTyping, get_hf_client @@ -86,19 +86,23 @@ def hf_tar_validate(repo_id: str, archive_in_repo: str, repo_type: RepoTypeTypin else: item = items[0] - index = hf_tar_get_index( - repo_id=repo_id, - archive_in_repo=archive_in_repo, - repo_type=repo_type, - revision=revision, + try: + index = hf_tar_get_index( + repo_id=repo_id, + archive_in_repo=archive_in_repo, + repo_type=repo_type, + revision=revision, - idx_repo_id=idx_repo_id, - idx_file_in_repo=idx_file_in_repo, - idx_repo_type=idx_repo_type, - idx_revision=idx_revision, + idx_repo_id=idx_repo_id, + idx_file_in_repo=idx_file_in_repo, + idx_repo_type=idx_repo_type, + idx_revision=idx_revision, + + hf_token=hf_token, + ) + except (EntryNotFoundError, RepositoryNotFoundError): + return False - hf_token=hf_token, - ) return hf_tar_item_validate( file_item=item, size=index['filesize'], diff --git a/test/index/test_validate.py b/test/index/test_validate.py index 456c5ee88f..1481f908f2 100644 --- a/test/index/test_validate.py +++ b/test/index/test_validate.py @@ -26,6 +26,20 @@ def test_hf_tar_file_download_lfs_not_found(self): archive_in_repo='mashu_skins_not_found.tar', ) + def test_hf_tar_file_download_lfs_not_found_idx_repo(self): + assert not hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + idx_repo_id='narugo/repo_not_found' + ) + + def test_hf_tar_file_download_lfs_not_found_idx_index(self): + assert not hf_tar_validate( + repo_id='narugo/test_cos5t_tars', + archive_in_repo='mashu_skins.tar', + idx_file_in_repo='mashu_skins_not_found.json', + ) + def test_hf_tar_file_download_lfs_is_directory(self): with pytest.raises(IsADirectoryError): hf_tar_validate(