Skip to content

Commit

Permalink
dev(narugo): save local index code
Browse files Browse the repository at this point in the history
  • Loading branch information
narugo1992 committed Aug 1, 2024
1 parent 1ce0c59 commit 9326294
Show file tree
Hide file tree
Showing 5 changed files with 250 additions and 2 deletions.
1 change: 1 addition & 0 deletions hfutils/index/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from .fetch import hf_tar_list_files, hf_tar_file_download, hf_tar_get_index, hf_tar_file_exists, \
ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch, hf_tar_file_size, hf_tar_file_info
from .local_fetch import tar_get_index, tar_file_info, tar_file_download, tar_file_size, tar_file_exists, tar_list_files
from .make import tar_create_index, hf_tar_create_index, tar_get_index_info, hf_tar_create_from_directory
from .validate import hf_tar_item_validate, hf_tar_validate
4 changes: 2 additions & 2 deletions hfutils/index/fetch.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import json
import os.path
from typing import Optional, Dict, Union
from typing import Optional, Dict, Union, List

from huggingface_hub.file_download import http_get, hf_hub_url
from huggingface_hub.utils import build_hf_headers
Expand Down Expand Up @@ -96,7 +96,7 @@ def hf_tar_list_files(repo_id: str, archive_in_repo: str,
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
idx_repo_type: Optional[RepoTypeTyping] = None, idx_revision: Optional[str] = None,
hf_token: Optional[str] = None):
hf_token: Optional[str] = None) -> List[str]:
"""
List files inside a tar archive file in a Hugging Face repository.
Expand Down
96 changes: 96 additions & 0 deletions hfutils/index/local_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
import os
from typing import Optional, List


def tar_get_index(archive_file: str, idx_file: Optional[str] = None):
    """
    Load the JSON index that describes a local tar archive.

    :param archive_file: Path of the local tar archive.
    :param idx_file: Path of the index file. When it is ``None`` (or empty), the index is
        looked up next to the archive, using the archive path with its extension replaced
        by ``.json`` (e.g. ``data/pack.tar`` -> ``data/pack.json``).
    :return: The parsed JSON content of the index file.
    """
    body, _ = os.path.splitext(archive_file)
    default_index_file = f'{body}.json'
    # Read the index as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so that indices containing non-ASCII file names load the same everywhere.
    with open(idx_file or default_index_file, 'r', encoding='utf-8') as f:
        return json.load(f)


def tar_list_files(archive_file: str, idx_file: Optional[str] = None) -> List[str]:
    """
    List the file names recorded in the index of a local tar archive.

    :param archive_file: Path of the local tar archive.
    :param idx_file: Optional path of the index file, forwarded to :func:`tar_get_index`.
    :return: The keys of the index's ``files`` mapping, exactly as stored in the index.
    """
    index = tar_get_index(archive_file=archive_file, idx_file=idx_file)
    file_table = index['files']
    return [*file_table.keys()]


def tar_file_exists(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> bool:
    """
    Check whether a file is recorded in the index of a local tar archive.

    :param archive_file: Path of the local tar archive.
    :param file_in_archive: Path of the file inside the archive. It is passed through
        ``_n_path`` before the lookup (presumably a path normalisation, so that forms such as
        ``./name`` and ``name`` are treated alike -- see ``hfutils.index.fetch``).
    :param idx_file: Optional path of the index file, forwarded to :func:`tar_get_index`.
    :return: ``True`` if the processed path is present in the processed file table.
    """
    # Imported lazily inside the function, as the other helpers in this module do.
    from .fetch import _hf_files_process, _n_path

    index_data = tar_get_index(archive_file=archive_file, idx_file=idx_file)
    known_files = _hf_files_process(index_data['files'])
    target = _n_path(file_in_archive)
    return target in known_files


def tar_file_info(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> dict:
    """
    Fetch the index entry of one file stored in a local tar archive.

    :param archive_file: Path of the local tar archive.
    :param file_in_archive: Path of the file inside the archive (processed with ``_n_path``
        before the lookup).
    :param idx_file: Optional path of the index file, forwarded to :func:`tar_get_index`.
    :return: The index entry stored for the file.
    :raises FileNotFoundError: If the file is not present in the index.
    """
    from .fetch import _hf_files_process, _n_path

    index_data = tar_get_index(archive_file=archive_file, idx_file=idx_file)
    table = _hf_files_process(index_data['files'])
    key = _n_path(file_in_archive)

    # Guard clause: bail out early when the index does not know this file.
    if key not in table:
        raise FileNotFoundError(f'File {file_in_archive!r} not found '
                                f'in local archive {archive_file!r}.')
    return table[key]


def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> int:
    """
    Get the size recorded in the index for a file stored in a local tar archive.

    :param archive_file: Path of the local tar archive.
    :param file_in_archive: Path of the file inside the archive.
    :param idx_file: Optional path of the index file, forwarded to :func:`tar_file_info`.
    :return: The ``size`` field of the file's index entry.
    :raises FileNotFoundError: Propagated from :func:`tar_file_info` when the file is not indexed.
    """
    entry = tar_file_info(
        archive_file=archive_file,
        file_in_archive=file_in_archive,
        idx_file=idx_file,
    )
    return entry['size']


def tar_file_download(archive_file: str, file_in_archive: str, local_file: str,
                      idx_file: Optional[str] = None, chunk_size: int = 1 << 20):
    """
    Extract a single file from a local tar archive, using the archive's index.

    The index entry provides the ``offset`` and ``size`` of the file's payload inside the
    archive; that byte range is copied to ``local_file`` in chunks. Afterwards the size of
    the written file is checked, and so is its SHA-256 when the index entry carries a
    ``sha256`` value. On any failure the (partial) output file is removed before the
    exception is re-raised.

    :param archive_file: Path of the local tar archive.
    :param file_in_archive: Path of the file inside the archive (processed with ``_n_path``
        before the lookup).
    :param local_file: Destination path. Missing parent directories are created.
    :param idx_file: Optional path of the index file, forwarded to :func:`tar_get_index`.
    :param chunk_size: Maximum number of bytes copied per read; must be positive.
    :raises ValueError: If ``chunk_size`` is not positive.
    :raises FileNotFoundError: If the file is not present in the index.
    :raises ArchiveStandaloneFileIncompleteDownload: If the number of bytes written differs
        from the size recorded in the index (e.g. the archive is truncated).
    :raises ArchiveStandaloneFileHashNotMatch: If the SHA-256 of the written file differs
        from the one recorded in the index.
    """
    from .fetch import _hf_files_process, _n_path, _f_sha256, \
        ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch

    # A zero chunk size can never make progress, and a negative one would make ``read``
    # consume everything up to EOF; reject both before touching the filesystem.
    if chunk_size <= 0:
        raise ValueError(f'chunk_size should be a positive integer, but {chunk_size!r} found.')

    index = tar_get_index(
        archive_file=archive_file,
        idx_file=idx_file,
    )
    files = _hf_files_process(index['files'])
    key = _n_path(file_in_archive)
    if key not in files:
        raise FileNotFoundError(f'File {file_in_archive!r} not found '
                                f'in local archive {archive_file!r}.')

    info = files[key]

    local_dir = os.path.dirname(local_file)
    if local_dir:
        os.makedirs(local_dir, exist_ok=True)
    try:
        with open(local_file, 'wb') as wf:
            # Empty files still get created, but the archive need not be opened for them.
            if info['size'] > 0:
                with open(archive_file, 'rb') as rf:
                    rf.seek(info['offset'])
                    remaining = info['size']
                    while remaining > 0:
                        chunk = rf.read(min(remaining, chunk_size))
                        if not chunk:
                            # EOF reached before the expected number of bytes was read
                            # (truncated archive or stale index). Stop here instead of
                            # spinning forever; the size check below reports the problem.
                            break
                        wf.write(chunk)
                        remaining -= len(chunk)

        # Checked after the output file is closed, so everything is flushed to disk.
        actual_size = os.path.getsize(local_file)
        if actual_size != info['size']:
            raise ArchiveStandaloneFileIncompleteDownload(
                f'Expected size is {info["size"]}, but actually {actual_size} downloaded.'
            )

        # The hash is only verified when the index provides one.
        if info.get('sha256'):
            _sha256 = _f_sha256(local_file)
            if _sha256 != info['sha256']:
                raise ArchiveStandaloneFileHashNotMatch(
                    f'Expected hash is {info["sha256"]!r}, but actually {_sha256!r} found.'
                )

    except Exception:
        # Never leave a partial or corrupted output file behind.
        if os.path.exists(local_file):
            os.remove(local_file)
        raise
15 changes: 15 additions & 0 deletions test/index/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import pytest
from hbutils.system import TemporaryDirectory

from hfutils.operate import download_directory_as_directory


@pytest.fixture(scope='module')
def local_narugo_test_cos5t_tars():
    """
    Module-scoped fixture yielding a temporary directory that holds a local copy of the
    ``narugo/test_cos5t_tars`` dataset repository.

    The directory is created with ``TemporaryDirectory`` and is therefore only valid while
    the fixture is active.
    """
    with TemporaryDirectory() as workdir:
        download_directory_as_directory(
            local_directory=workdir,
            repo_id='narugo/test_cos5t_tars',
            repo_type='dataset',
        )
        yield workdir
136 changes: 136 additions & 0 deletions test/index/test_local_fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
import os.path

import pytest
from hbutils.testing import isolated_directory
from natsort import natsorted

from hfutils.index import tar_list_files, tar_file_exists, tar_file_download, tar_file_info, \
tar_file_size
from test.testings import get_testfile, file_compare


@pytest.mark.unittest
class TestIndexLocalFetch:
    """Tests for the local tar index helpers, run against the locally downloaded archives."""

    def test_tar_list_files(self, local_narugo_test_cos5t_tars):
        """The index of ``mashu_skins.tar`` lists the expected 17 files."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')
        expected = [
            '.meta.json', 'Bright_Voyager.png', 'Grail_League_1星.png', 'Grail_League_2星.png', 'Grail_League_3星.png',
            'Grail_League_4星.png', 'Grail_League_5星.png', '奥特瑙斯.png', '奥特瑙斯_改建型.png', '常夏的泳装.png',
            '常夏的泳装Ver_02.png', '愚人节.png', '愚人节_奥特瑙斯.png', '第1阶段.png', '第2阶段.png', '第3阶段.png',
            '第4阶段.png'
        ]

        files = tar_list_files(archive_file=archive)
        assert len(files) == 17
        assert natsorted(files) == expected

    def test_tar_file_exists(self, local_narugo_test_cos5t_tars):
        """Existing files are found (with or without a leading ``./``); unknown ones are not."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')

        assert tar_file_exists(archive_file=archive, file_in_archive='.meta.json')
        assert tar_file_exists(archive_file=archive, file_in_archive='愚人节_奥特瑙斯.png')
        assert tar_file_exists(archive_file=archive, file_in_archive='./愚人节_奥特瑙斯.png')
        assert not tar_file_exists(archive_file=archive, file_in_archive='愚人节奥特瑙斯.png')

    def test_tar_file_info(self, local_narugo_test_cos5t_tars):
        """Index entries carry the expected offset, hash and size; unknown files raise."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')
        meta_info = {
            'offset': 2725376,
            'sha256': '4585b01c251a496b73cb231d29fc711cfb1d682a84334d95f6f5b6c1cc5b5222',
            'size': 8968
        }
        png_info = {
            'offset': 3954176,
            'sha256': '991497fa586f6f4529827e0f8f1f228c20ec9fb507c314ee9d20d47c46f26e89',
            'size': 255276
        }

        assert tar_file_info(archive_file=archive, file_in_archive='.meta.json') == meta_info
        assert tar_file_info(archive_file=archive, file_in_archive='愚人节_奥特瑙斯.png') == png_info
        assert tar_file_info(archive_file=archive, file_in_archive='./愚人节_奥特瑙斯.png') == png_info
        with pytest.raises(FileNotFoundError):
            _ = tar_file_info(archive_file=archive, file_in_archive='愚人节奥特瑙斯.png')

    def test_tar_file_size(self, local_narugo_test_cos5t_tars):
        """Sizes match the index; unknown files raise ``FileNotFoundError``."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')

        assert tar_file_size(archive_file=archive, file_in_archive='.meta.json') == 8968
        assert tar_file_size(archive_file=archive, file_in_archive='愚人节_奥特瑙斯.png') == 255276
        assert tar_file_size(archive_file=archive, file_in_archive='./愚人节_奥特瑙斯.png') == 255276
        with pytest.raises(FileNotFoundError):
            _ = tar_file_size(archive_file=archive, file_in_archive='愚人节奥特瑙斯.png')

    def test_tar_file_download_small(self, local_narugo_test_cos5t_tars):
        """A small file is extracted byte-for-byte."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')
        with isolated_directory():
            tar_file_download(
                archive_file=archive,
                file_in_archive='.meta.json',
                local_file='.meta.json'
            )
            file_compare(get_testfile('skin_mashu', '.meta.json'), '.meta.json')

    def test_tar_file_download_lfs(self, local_narugo_test_cos5t_tars):
        """A larger image file, addressed with a leading ``./``, is extracted byte-for-byte."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')
        with isolated_directory():
            tar_file_download(
                archive_file=archive,
                file_in_archive='./愚人节_奥特瑙斯.png',
                local_file='愚人节_奥特瑙斯.png'
            )
            file_compare(get_testfile('skin_mashu', '愚人节_奥特瑙斯.png'), '愚人节_奥特瑙斯.png')

    def test_tar_file_download_not_found(self, local_narugo_test_cos5t_tars):
        """Downloading a file that is not in the index raises ``FileNotFoundError``."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'mashu_skins.tar')
        with isolated_directory(), pytest.raises(FileNotFoundError):
            tar_file_download(
                archive_file=archive,
                file_in_archive='./愚人节奥特瑙斯.png',
                local_file='愚人节_奥特瑙斯.png'
            )

    def test_tar_file_download_subdir(self, local_narugo_test_cos5t_tars):
        """A file in an archive sub-directory can be written into a not-yet-existing local directory."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'ex3.tar')
        with isolated_directory():
            tar_file_download(
                archive_file=archive,
                file_in_archive='artoria_caster_third_ascension_fate/sankaku_21305298.jpg',
                local_file='f/ac.jpg'
            )
            file_compare(get_testfile('sankaku_21305298.jpg'), 'f/ac.jpg')

    def test_tar_file_download_empty(self, local_narugo_test_cos5t_tars):
        """An empty file in the archive produces an empty local file."""
        archive = os.path.join(local_narugo_test_cos5t_tars, 'empty_file.tar')
        with isolated_directory():
            tar_file_download(
                archive_file=archive,
                file_in_archive='empty_file',
                local_file='empty_file',
            )
            assert os.path.getsize('empty_file') == 0

0 comments on commit 9326294

Please sign in to comment.