Skip to content

Commit

Permalink
Merge pull request #12 from deepghs/dev/tar
Browse files Browse the repository at this point in the history
dev(narugo): add more tar tests
  • Loading branch information
narugo1992 authored Apr 21, 2024
2 parents dc33b1b + 8ebb7fc commit 4797fb9
Show file tree
Hide file tree
Showing 7 changed files with 270 additions and 13 deletions.
1 change: 1 addition & 0 deletions docs/source/api_doc/index/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ hfutils.index

fetch
make
validate

22 changes: 22 additions & 0 deletions docs/source/api_doc/index/validate.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
hfutils.index.validate
================================

.. currentmodule:: hfutils.index.validate

.. automodule:: hfutils.index.validate


hf_tar_item_validate
--------------------------------------

.. autofunction:: hf_tar_item_validate



hf_tar_validate
--------------------------------------

.. autofunction:: hf_tar_validate



1 change: 1 addition & 0 deletions hfutils/index/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
from .validate import hf_tar_item_validate, hf_tar_validate
48 changes: 35 additions & 13 deletions hfutils/index/make.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,25 +92,27 @@ def tar_create_index(src_tar_file, dst_index_file: Optional[str] = None,
return dst_index_file


def hf_tar_create_index(repo_id: str, filename: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_filename: Optional[str] = None,
def hf_tar_create_index(repo_id: str, archive_in_repo: str,
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
chunk_for_hash: int = 1 << 20, with_hash: bool = True, hf_token: Optional[str] = None):
chunk_for_hash: int = 1 << 20, with_hash: bool = True, skip_when_synced: bool = True,
hf_token: Optional[str] = None, ):
"""
Create an index file for a tar archive file in a Hugging Face repository.
:param repo_id: The identifier of the repository.
:type repo_id: str
:param filename: The path to the tar archive file.
:type filename: str
:param archive_in_repo: The path to the tar archive file.
:type archive_in_repo: str
:param repo_type: The type of the Hugging Face repository, defaults to 'dataset'.
:type repo_type: RepoTypeTyping, optional
:param revision: The revision of the repository, defaults to 'main'.
:type revision: str, optional
:param idx_repo_id: The identifier of the index repository, defaults to None.
:type idx_repo_id: str, optional
:param idx_filename: The path to save the index file in the index repository, defaults to None.
:type idx_filename: str, optional
:param idx_file_in_repo: The path to save the index file in the index repository, defaults to None.
:type idx_file_in_repo: str, optional
:param idx_repo_type: The type of the index repository, defaults to None.
:type idx_repo_type: RepoTypeTyping, optional
:param idx_revision: The revision of the index repository, defaults to None.
Expand All @@ -119,31 +121,51 @@ def hf_tar_create_index(repo_id: str, filename: str, repo_type: RepoTypeTyping =
:type chunk_for_hash: int, optional
:param with_hash: Whether to include file hashes in the index, defaults to True.
:type with_hash: bool, optional
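:param skip_when_synced: When True, validate the existing index against the archive first and return without downloading or uploading anything if it already matches, defaults to True.
:type skip_when_synced: bool, optional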
:param hf_token: The Hugging Face access token, defaults to None.
:type hf_token: str, optional
"""
body, _ = os.path.splitext(archive_in_repo)
default_index_filename = f'{body}.json'

from .validate import hf_tar_validate
if skip_when_synced and hf_tar_validate(
repo_id=repo_id,
repo_type=repo_type,
archive_in_repo=archive_in_repo,
revision=revision,

idx_repo_id=idx_repo_id or repo_id,
idx_repo_type=idx_repo_type or repo_type,
idx_file_in_repo=idx_file_in_repo or default_index_filename,
idx_revision=idx_revision or revision,

hf_token=hf_token,
):
logging.info(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} already indexed, skipped.')
return

with TemporaryDirectory() as td:
local_tar_file = os.path.join(td, os.path.basename(filename))
local_tar_file = os.path.join(td, os.path.basename(archive_in_repo))
download_file_to_file(
repo_id=repo_id,
repo_type=repo_type,
file_in_repo=filename,
file_in_repo=archive_in_repo,
local_file=local_tar_file,
revision=revision,
hf_token=hf_token,
)
dst_index_file = tar_create_index(local_tar_file, chunk_for_hash=chunk_for_hash, with_hash=with_hash)

body, _ = os.path.splitext(filename)
default_index_filename = f'{body}.json'
upload_file_to_file(
repo_id=idx_repo_id or repo_id,
repo_type=idx_repo_type or repo_type,
file_in_repo=idx_filename or default_index_filename,
file_in_repo=idx_file_in_repo or default_index_filename,
local_file=dst_index_file,
revision=idx_revision or revision,
hf_token=hf_token,
message=f'Create index for {repo_type}s/{repo_id}@{revision}/{filename}',
message=f'Create index for {repo_type}s/{repo_id}@{revision}/{archive_in_repo}',
)


Expand Down
111 changes: 111 additions & 0 deletions hfutils/index/validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
from typing import Optional

from huggingface_hub.hf_api import RepoFile
from huggingface_hub.utils import EntryNotFoundError, RepositoryNotFoundError

from .fetch import hf_tar_get_index
from ..operate.base import RepoTypeTyping, get_hf_client


def hf_tar_item_validate(file_item: RepoFile, size: int, hash_: Optional[str] = None, hash_lfs: Optional[str] = None):
    """
    Check a repository file entry against an expected size and expected hashes.

    The entry is accepted only when its size equals ``size`` and at least one
    of the non-empty expected hashes equals one of the entry's own non-empty
    hashes.

    :param file_item: The file entry obtained from the Hugging Face repository.
    :type file_item: RepoFile
    :param size: The expected file size. It is compared with ``file_item.lfs.size``
        when the entry carries LFS information, otherwise with ``file_item.size``.
    :type size: int
    :param hash_: An expected hash; the entry's ``blob_id`` is one of the values it may match.
    :type hash_: str, optional
    :param hash_lfs: An expected hash; the entry's LFS ``sha256`` (when LFS information
        is present) is one of the values it may match.
    :type hash_lfs: str, optional
    :return: True if the size matches and some expected hash matches, False otherwise.
    :rtype: bool
    """
    lfs_info = file_item.lfs

    # For LFS-tracked entries the real payload size lives on the LFS record.
    actual_size = lfs_info.size if lfs_info else file_item.size
    if size != actual_size:
        return False

    # Collect every hash the repository knows for this entry.
    known_hashes = {file_item.blob_id}
    if lfs_info:
        known_hashes.add(lfs_info.sha256)

    # A match only counts when the shared value is non-empty, so absent
    # (None / empty) hashes on both sides never validate the entry.
    shared_hashes = known_hashes & {hash_, hash_lfs}
    return any(shared_hashes)


def hf_tar_validate(repo_id: str, archive_in_repo: str, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
                    idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
                    idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
                    hf_token: Optional[str] = None):
    """
    Check whether the stored index of a tar archive matches the archive in the repository.

    The archive's entry is looked up in the repository first. The index is then
    loaded via :func:`hf_tar_get_index`, and its ``filesize``, ``hash`` and
    ``hash_lfs`` values are checked against that entry with
    :func:`hf_tar_item_validate`.

    :param repo_id: The ID of the Hugging Face repository holding the archive.
    :type repo_id: str
    :param archive_in_repo: The path to the tar archive in the repository.
    :type archive_in_repo: str
    :param repo_type: The type of the Hugging Face repository, defaults to 'dataset'.
    :type repo_type: RepoTypeTyping, optional
    :param revision: The revision of the repository, defaults to 'main'.
    :type revision: str, optional
    :param idx_repo_id: The ID of the repository where the index file is stored.
        Passed through unchanged to :func:`hf_tar_get_index`.
    :type idx_repo_id: Optional[str], optional
    :param idx_file_in_repo: The path to the index file in the index repository.
        Passed through unchanged to :func:`hf_tar_get_index`.
    :type idx_file_in_repo: Optional[str], optional
    :param idx_repo_type: The type of the repository where the index file is stored.
        Passed through unchanged to :func:`hf_tar_get_index`.
    :type idx_repo_type: Optional[RepoTypeTyping], optional
    :param idx_revision: The revision of the repository where the index file is stored.
        Passed through unchanged to :func:`hf_tar_get_index`.
    :type idx_revision: Optional[str], optional
    :param hf_token: The Hugging Face token for authentication, defaults to None.
    :type hf_token: Optional[str], optional
    :raises EntryNotFoundError: If no entry is returned for ``archive_in_repo``.
    :raises IsADirectoryError: If the entry for ``archive_in_repo`` is not a file.
    :return: True if the index matches the archive entry. False if it does not match, or
        if loading the index raises ``EntryNotFoundError`` or ``RepositoryNotFoundError``.
    :rtype: bool
    """
    client = get_hf_client(hf_token)
    entries = list(client.get_paths_info(
        repo_id=repo_id,
        repo_type=repo_type,
        paths=[archive_in_repo],
        revision=revision,
    ))

    # Guard clauses: the archive must exist and must be a regular file.
    if not entries:
        raise EntryNotFoundError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} not found.')
    archive_entry = entries[0]
    if not isinstance(archive_entry, RepoFile):
        raise IsADirectoryError(f'Entry {repo_type}s/{repo_id}/{archive_in_repo} is a directory, not a file.')

    archive_location = dict(
        repo_id=repo_id,
        archive_in_repo=archive_in_repo,
        repo_type=repo_type,
        revision=revision,
    )
    index_location = dict(
        idx_repo_id=idx_repo_id,
        idx_file_in_repo=idx_file_in_repo,
        idx_repo_type=idx_repo_type,
        idx_revision=idx_revision,
    )
    try:
        index_data = hf_tar_get_index(**archive_location, **index_location, hf_token=hf_token)
    except (EntryNotFoundError, RepositoryNotFoundError):
        # A missing index file or index repository simply means "not validated".
        return False

    return hf_tar_item_validate(
        file_item=archive_entry,
        size=index_data['filesize'],
        hash_=index_data.get('hash'),
        hash_lfs=index_data.get('hash_lfs'),
    )
52 changes: 52 additions & 0 deletions test/index/test_make.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import json
import os.path

import pytest
from hbutils.testing import isolated_directory
Expand Down Expand Up @@ -64,3 +65,54 @@ def test_tar_create_index(self, raw_tar):
'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
}

def test_tar_create_index_subdir(self, raw_tar):
    """Index ``subdir/raw.tar`` without an explicit destination and compare ``subdir/raw.json``."""
    tar_path = os.path.join('subdir', 'raw.tar')
    index_path = os.path.join('subdir', 'raw.json')
    expected_index = {
        'files': {
            '1.txt': {
                'offset': 3584,
                'sha256': '57a67d463dde06dcf3bf3bd8382ebf5c8d6e0a854135914e215f09fc0e1080b9',
                'size': 13
            },
            'README.md': {
                'offset': 1536,
                'sha256': '75fae9f83087725e606ed7bf243a6655b1ddf583919529b3291980322b62af77',
                'size': 51
            },
            'subdir/script.py': {
                'offset': 5632,
                'sha256': '5c3086e72529e59e42002f11bbfabc40b084981daedb1a3d4a31623122fd8867',
                'size': 33
            }
        },
        'filesize': 10240,
        'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
        'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
    }

    with isolated_directory({tar_path: raw_tar}):
        tar_create_index(tar_path)
        # The index is read back while still inside the isolated directory.
        with open(index_path, 'r') as index_file:
            produced_index = json.load(index_file)

    assert produced_index == expected_index

def test_tar_create_index_subdir_no_hash(self, raw_tar):
    """Index ``subdir/raw.tar`` with ``with_hash=False``; per-file entries carry no ``sha256``."""
    tar_path = os.path.join('subdir', 'raw.tar')
    index_path = os.path.join('subdir', 'raw.json')
    expected_index = {
        'files': {
            '1.txt': {
                'offset': 3584,
                'size': 13
            },
            'README.md': {
                'offset': 1536,
                'size': 51
            },
            'subdir/script.py': {
                'offset': 5632,
                'size': 33
            }
        },
        # The archive-level size and hashes are still expected in the index.
        'filesize': 10240,
        'hash': '55d6e39981cd94f0d9732b40ff677a508d6652a1',
        'hash_lfs': 'be9ae98f74065d2df47f38263644941532b6615b5a60c34db8cc864b4ade147a'
    }

    with isolated_directory({tar_path: raw_tar}):
        tar_create_index(tar_path, with_hash=False)
        with open(index_path, 'r') as index_file:
            produced_index = json.load(index_file)

    assert produced_index == expected_index
48 changes: 48 additions & 0 deletions test/index/test_validate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest
from huggingface_hub.utils import EntryNotFoundError

from hfutils.index import hf_tar_validate


@pytest.mark.unittest
class TestIndexValidate:
    """Tests for ``hf_tar_validate`` run against the ``narugo/test_cos5t_tars`` repository."""

    # Repository and archive shared by most of the cases below.
    _REPO_ID = 'narugo/test_cos5t_tars'
    _ARCHIVE = 'mashu_skins.tar'

    def _validate(self, archive_in_repo=_ARCHIVE, **idx_options):
        """Call ``hf_tar_validate`` on the shared repository, forwarding any index options."""
        return hf_tar_validate(
            repo_id=self._REPO_ID,
            archive_in_repo=archive_in_repo,
            **idx_options,
        )

    def test_hf_tar_file_download_lfs(self):
        """The archive validates when no index location is given explicitly."""
        assert self._validate()

    def test_hf_tar_file_download_lfs_extra(self):
        """The ``ex3.json`` index does not validate against this archive."""
        assert not self._validate(idx_file_in_repo='ex3.json')

    def test_hf_tar_file_download_lfs_not_found(self):
        """An archive path that is not found raises ``EntryNotFoundError``."""
        with pytest.raises(EntryNotFoundError):
            self._validate(archive_in_repo='mashu_skins_not_found.tar')

    def test_hf_tar_file_download_lfs_not_found_idx_repo(self):
        """An index repository that is not found yields a falsy result, not an exception."""
        assert not self._validate(idx_repo_id='narugo/repo_not_found')

    def test_hf_tar_file_download_lfs_not_found_idx_index(self):
        """An index file that is not found yields a falsy result, not an exception."""
        assert not self._validate(idx_file_in_repo='mashu_skins_not_found.json')

    def test_hf_tar_file_download_lfs_is_directory(self):
        """Pointing at the ``1001-1500`` entry raises ``IsADirectoryError``."""
        with pytest.raises(IsADirectoryError):
            self._validate(archive_in_repo='1001-1500')

0 comments on commit 4797fb9

Please sign in to comment.