Skip to content

Commit

Permalink
Merge pull request #36 from deepghs/dev/indexsafe
Browse files Browse the repository at this point in the history
dev(narugo): add lock for hf_tar_get_index
  • Loading branch information
narugo1992 authored Aug 5, 2024
2 parents cd3599e + 60fa72a commit 44158f3
Showing 1 changed file with 17 additions and 7 deletions.
24 changes: 17 additions & 7 deletions hfutils/index/fetch.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import json
import os.path
import threading
from collections import defaultdict
from typing import Optional, Dict, Union, List

from huggingface_hub.file_download import http_get, hf_hub_url
Expand All @@ -21,6 +23,9 @@ class ArchiveStandaloneFileHashNotMatch(Exception):
"""


# Registry of locks used by hf_tar_get_index, one per index file.
# Keys are (repo_id, repo_type, filename, revision) tuples; the defaultdict
# creates a fresh threading.Lock the first time a given key is looked up.
# The lock is held while the index file is downloaded and parsed, so threads
# requesting the same index file take turns instead of running that step
# concurrently. These are threading locks, so they only coordinate threads
# inside the current process.
_HF_TAR_IDX_LOCKS = defaultdict(threading.Lock)


def hf_tar_get_index(repo_id: str, archive_in_repo: str,
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main',
idx_repo_id: Optional[str] = None, idx_file_in_repo: Optional[str] = None,
Expand Down Expand Up @@ -83,13 +88,18 @@ def hf_tar_get_index(repo_id: str, archive_in_repo: str,
hf_client = get_hf_client(hf_token)
body, _ = os.path.splitext(archive_in_repo)
default_index_file = f'{body}.json'
with open(hf_client.hf_hub_download(
repo_id=idx_repo_id or repo_id,
repo_type=idx_repo_type or repo_type,
filename=idx_file_in_repo or default_index_file,
revision=idx_revision or revision,
), 'r') as f:
return json.load(f)
f_repo_id = idx_repo_id or repo_id
f_repo_type = idx_repo_type or repo_type
f_filename = idx_file_in_repo or default_index_file
f_revision = idx_revision or revision
with _HF_TAR_IDX_LOCKS[(f_repo_id, f_repo_type, f_filename, f_revision)]:
with open(hf_client.hf_hub_download(
repo_id=f_repo_id,
repo_type=f_repo_type,
filename=f_filename,
revision=f_revision,
), 'r') as f:
return json.load(f)


def hf_tar_list_files(repo_id: str, archive_in_repo: str,
Expand Down

0 comments on commit 44158f3

Please sign in to comment.