-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #12 from deepghs/dev/tar
dev(narugo): add more tar tests
- Loading branch information
Showing
7 changed files
with
270 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,4 +11,5 @@ hfutils.index | |
|
||
fetch | ||
make | ||
validate | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
hfutils.index.validate | ||
================================ | ||
|
||
.. currentmodule:: hfutils.index.validate | ||
|
||
.. automodule:: hfutils.index.validate | ||
|
||
|
||
hf_tar_item_validate | ||
-------------------------------------- | ||
|
||
.. autofunction:: hf_tar_item_validate | ||
|
||
|
||
|
||
hf_tar_validate | ||
-------------------------------------- | ||
|
||
.. autofunction:: hf_tar_validate | ||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \ | ||
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch | ||
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory | ||
from .validate import hf_tar_item_validate, hf_tar_validate |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,111 @@ | ||
from typing import Optional | ||
|
||
from huggingface_hub.hf_api import RepoFile | ||
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError | ||
|
||
from .fetch import hf_tar_get_index | ||
from ..operate.base import RepoTypeTyping, get_hf_client | ||
|
||
|
||
def hf_tar_item_validate(file_item: RepoFile, size: int, hash_: Optional[str] = None, hash_lfs: Optional[str] = None): | ||
""" | ||
Validate a file item in a tar archive. | ||
This function checks if the file item matches the expected size and hash. | ||
:param file_item: The file item from the Hugging Face repository. | ||
:type file_item: RepoFile | ||
:param size: The expected size of the file. | ||
:type size: int | ||
:param hash_: The expected SHA-1 hash of the file. | ||
:type hash_: str, optional | ||
:param hash_lfs: The expected SHA-256 hash of the file if stored in LFS. | ||
:type hash_lfs: str, optional | ||
:return: True if the file item is valid, False otherwise. | ||
:rtype: bool | ||
""" | ||
# size not match | ||
if (file_item.lfs and size != file_item.lfs.size) or \ | ||
(not file_item.lfs and size != file_item.size): | ||
return False | ||
|
||
# compare tar file hash | ||
item_hashes = [file_item.blob_id] | ||
if file_item.lfs: | ||
item_hashes.append(file_item.lfs.sha256) | ||
item_hashes = set(filter(bool, item_hashes)) | ||
cmp_hashes = [hash_, hash_lfs] | ||
cmp_hashes = set(filter(bool, cmp_hashes)) | ||
return bool(cmp_hashes & item_hashes) | ||
|
||
|
||
def hf_tar_validate(repo_id: str, archive_in_repo: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', | ||
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None, | ||
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None, | ||
hf_token: Optional[str] = None): | ||
""" | ||
Validate a tar archive in a Hugging Face repository. | ||
This function validates if the tar archive in the Hugging Face repository matches the expected size and hash. | ||
:param repo_id: The ID of the Hugging Face repository. | ||
:type repo_id: str | ||
:param archive_in_repo: The path to the tar archive in the repository. | ||
:type archive_in_repo: str | ||
:param repo_type: The type of the Hugging Face repository, defaults to 'dataset'. | ||
:type repo_type: RepoTypeTyping, optional | ||
:param revision: The revision of the repository, defaults to 'main'. | ||
:type revision: str, optional | ||
:param idx_repo_id: The ID of the repository where the index file is stored. | ||
:type idx_repo_id: Optional[str], optional | ||
:param idx_file_in_repo: The path to the index file in the repository. | ||
:type idx_file_in_repo: Optional[str], optional | ||
:param idx_repo_type: The type of the repository where the index file is stored. | ||
:type idx_repo_type: Optional[RepoTypeTyping], optional | ||
:param idx_revision: The revision of the repository where the index file is stored. | ||
:type idx_revision: Optional[str], optional | ||
:param hf_token: The Hugging Face token for authentication, defaults to None. | ||
:type hf_token: Optional[str], optional | ||
:raises EntryNotFoundError: If the specified entry is not found in the repository. | ||
:raises IsADirectoryError: If the specified entry is a directory. | ||
:return: True if the tar archive is valid, False otherwise. | ||
:rtype: bool | ||
""" | ||
hf_client = get_hf_client(hf_token) | ||
|
||
items = list(hf_client.get_paths_info( | ||
repo_id=repo_id, | ||
repo_type=repo_type, | ||
paths=[archive_in_repo], | ||
revision=revision, | ||
)) | ||
if len(items) == 0: | ||
raise EntryNotFoundError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} not found.') | ||
elif not isinstance(items[0], RepoFile): | ||
raise IsADirectoryError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} is a directory, not a file.') | ||
else: | ||
item = items[0] | ||
|
||
try: | ||
index = hf_tar_get_index( | ||
repo_id=repo_id, | ||
archive_in_repo=archive_in_repo, | ||
repo_type=repo_type, | ||
revision=revision, | ||
|
||
idx_repo_id=idx_repo_id, | ||
idx_file_in_repo=idx_file_in_repo, | ||
idx_repo_type=idx_repo_type, | ||
idx_revision=idx_revision, | ||
|
||
hf_token=hf_token, | ||
) | ||
except (EntryNotFoundError, RepositoryNotFoundError): | ||
return False | ||
|
||
return hf_tar_item_validate( | ||
file_item=item, | ||
size=index['filesize'], | ||
hash_=index.get('hash'), | ||
hash_lfs=index.get('hash_lfs'), | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import pytest | ||
from huggingface_hub.utils import EntryNotFoundError | ||
|
||
from hfutils.index import hf_tar_validate | ||
|
||
|
||
@pytest.mark.unittest | ||
class TestIndexValidate: | ||
def test_hf_tar_file_download_lfs(self): | ||
assert hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='mashu_skins.tar', | ||
) | ||
|
||
def test_hf_tar_file_download_lfs_extra(self): | ||
assert not hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='mashu_skins.tar', | ||
idx_file_in_repo='ex3.json' | ||
) | ||
|
||
def test_hf_tar_file_download_lfs_not_found(self): | ||
with pytest.raises(EntryNotFoundError): | ||
hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='mashu_skins_not_found.tar', | ||
) | ||
|
||
def test_hf_tar_file_download_lfs_not_found_idx_repo(self): | ||
assert not hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='mashu_skins.tar', | ||
idx_repo_id='narugo/repo_not_found' | ||
) | ||
|
||
def test_hf_tar_file_download_lfs_not_found_idx_index(self): | ||
assert not hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='mashu_skins.tar', | ||
idx_file_in_repo='mashu_skins_not_found.json', | ||
) | ||
|
||
def test_hf_tar_file_download_lfs_is_directory(self): | ||
with pytest.raises(IsADirectoryError): | ||
hf_tar_validate( | ||
repo_id='narugo/test_cos5t_tars', | ||
archive_in_repo='1001-1500', | ||
) |