diff --git a/.github/workflows/ir_repos.yml b/.github/workflows/ir_repos.yml
new file mode 100644
index 0000000000..c3438df2bc
--- /dev/null
+++ b/.github/workflows/ir_repos.yml
@@ -0,0 +1,116 @@
+#file: noinspection YAMLSchemaValidation
+name: Irregular Repos
+
+on:
+  workflow_dispatch:
+  schedule:
+    - cron: '0 * * * 0'  # at minute 0 of every hour on Sundays (UTC)
+
+jobs:
+  check_irregular_repo:
+    name: Check Repos
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os:
+          - 'ubuntu-latest'
+        python-version:
+          - '3.8'
+
+    steps:
+      - name: Get system version for Linux
+        if: ${{ contains(matrix.os, 'ubuntu') }}
+        shell: bash
+        run: |
+          echo "OS_NAME=Linux" >> $GITHUB_ENV
+          echo "IS_WIN=" >> $GITHUB_ENV
+          echo "IS_MAC=" >> $GITHUB_ENV
+      - name: Get system version for Windows
+        if: ${{ contains(matrix.os, 'windows') }}
+        shell: bash
+        run: |
+          echo "OS_NAME=Windows" >> $GITHUB_ENV
+          echo "IS_WIN=1" >> $GITHUB_ENV
+          echo "IS_MAC=" >> $GITHUB_ENV
+      - name: Get system version for MacOS
+        if: ${{ contains(matrix.os, 'macos') }}
+        shell: bash
+        run: |
+          echo "OS_NAME=MacOS" >> $GITHUB_ENV
+          echo "IS_WIN=" >> $GITHUB_ENV
+          echo "IS_MAC=1" >> $GITHUB_ENV
+      - name: Set environment for CPython
+        if: ${{ !contains(matrix.python-version, 'pypy') }}
+        shell: bash
+        run: |
+          echo "IS_PYPY=" >> $GITHUB_ENV
+      - name: Set environment for PyPy
+        if: ${{ contains(matrix.python-version, 'pypy') }}
+        shell: bash
+        run: |
+          echo "IS_PYPY=1" >> $GITHUB_ENV
+      - name: Checkout code
+        uses: actions/checkout@v3.3.0
+        with:
+          fetch-depth: 20
+          submodules: 'recursive'
+      - name: Set up system dependencies on Linux
+        if: ${{ env.OS_NAME == 'Linux' }}
+        shell: bash
+        run: |
+          sudo apt-get update
+          sudo apt-get install -y tree cloc wget curl make zip
+          sudo apt-get install -y git-lfs
+          sudo apt-get install -y p7zip-full rar unrar
+      - name: Set up system dependencies on Windows
+        if: ${{ env.OS_NAME == 'Windows' }}
+        shell: bash
+        run: |
+          choco install tree cloc wget curl make zip
+          choco install 7zip winrar  # unrar should be added
+      - name: Set up system dependencies on MacOS
+        if: ${{ env.OS_NAME == 'MacOS' }}
+        run: |
+          brew install tree cloc wget curl make zip
+          brew install sevenzip
+          brew install --cask rar
+      - name: Set up python ${{ matrix.python-version }}
+        uses: actions/setup-python@v2
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        shell: bash
+        run: |
+          python -m pip install --upgrade pip
+          pip install --upgrade flake8 setuptools wheel twine
+          pip install -r requirements.txt
+          pip install -r requirements-test.txt
+      - name: Test the basic environment
+        shell: bash
+        run: |
+          python -V
+          pip --version
+          pip list
+          tree .
+          cloc hfutils
+          cloc test
+      - name: Sync irregular repositories
+        env:
+          CI: 'true'
+          HF_TOKEN: ${{ secrets.HF_TOKEN }}
+        shell: bash
+        run: |
+          python -m tools.irregular_repo
+      - name: Commit changes
+        id: commit
+        run: |
+          git config user.name 'narugo1992'
+          git config user.email 'narugo992@gmail.com'
+          git add -A
+          git diff-index --quiet HEAD || git commit -a -m "dev(narugo): auto sync irregular repositories $(date -R)"
+      - name: Push changes
+        uses: ad-m/github-push-action@master
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          branch: ${{ github.ref }}
diff --git a/.gitignore b/.gitignore
index e2e99e4fde..211e41fe16 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1223,4 +1223,5 @@ fabric.properties
 *.pt
 /runs
 /YOLOv8
-.benchmarks
\ No newline at end of file
+.benchmarks
+!/hfutils/utils/irregular_repo.json
\ No newline at end of file
diff --git a/docs/source/api_doc/utils/index.rst b/docs/source/api_doc/utils/index.rst
index c3f0bb5c97..ecb408601e 100644
--- a/docs/source/api_doc/utils/index.rst
+++ b/docs/source/api_doc/utils/index.rst
@@ -11,6 +11,7 @@ hfutils.utils
     binary
     download
+    path
     tqdm_
     walk
diff --git a/docs/source/api_doc/utils/path.rst b/docs/source/api_doc/utils/path.rst
new file mode 100644
index 0000000000..4418c54259
--- /dev/null
+++ b/docs/source/api_doc/utils/path.rst
@@ -0,0 +1,36 @@
+hfutils.utils.path
+=================================
+
+.. currentmodule:: hfutils.utils.path
+
+.. automodule:: hfutils.utils.path
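+
+A quick sketch of the intended round trip; the values below mirror the cases
+exercised in ``test/utils/test_path.py`` from this change::
+
+    >>> from hfutils.utils import hf_normpath, hf_fs_path, parse_hf_fs_path
+    >>> hf_normpath('1/../2/3')
+    '2/3'
+    >>> hf_fs_path(repo_id='narugo/test_ds_repo', filename='1/2/3', revision='main')
+    'datasets/narugo/test_ds_repo@main/1/2/3'
+    >>> parse_hf_fs_path('datasets/narugo/test_ds_repo@main/1/2/3').filename
+    '1/2/3'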
+
+
+hf_normpath
+--------------------------------------------
+
+.. autofunction:: hf_normpath
+
+
+
+hf_fs_path
+--------------------------------------------
+
+.. autofunction:: hf_fs_path
+
+
+
+parse_hf_fs_path
+--------------------------------------------
+
+.. autofunction:: parse_hf_fs_path
+
+
+
+HfFileSystemPath
+--------------------------------------------
+
+.. autoclass:: HfFileSystemPath
+
+
diff --git a/hfutils/operate/base.py b/hfutils/operate/base.py
index bea4b6b7a8..85a9ae49c5 100644
--- a/hfutils/operate/base.py
+++ b/hfutils/operate/base.py
@@ -5,6 +5,8 @@
 
 from huggingface_hub import HfApi, HfFileSystem
 
+from hfutils.utils import parse_hf_fs_path
+
 RepoTypeTyping = Literal['dataset', 'model', 'space']
 REPO_TYPES = ['dataset', 'model', 'space']
 
@@ -113,12 +115,15 @@ def list_files_in_repository(repo_id: str, repo_type: RepoTypeTyping = 'dataset'
     try:
         _exist_files = [
-            os.path.relpath(file, repo_root_path)
+            parse_hf_fs_path(file).filename
             for file in hf_fs.glob(f'{repo_root_path}/**', revision=revision)
         ]
     except FileNotFoundError:
         return []
 
-    _exist_ps = sorted([(file, file.split(os.sep)) for file in _exist_files], key=lambda x: x[1])
+    if subdir and subdir != '.':
+        _exist_files = [os.path.relpath(file, subdir) for file in _exist_files]
+
+    _exist_ps = sorted([(file, file.split('/')) for file in _exist_files], key=lambda x: x[1])
     retval = []
     for i, (file, segments) in enumerate(_exist_ps):
         if i < len(_exist_ps) - 1 and segments == _exist_ps[i + 1][1][:len(segments)]:
diff --git a/hfutils/utils/__init__.py b/hfutils/utils/__init__.py
index 275591104c..7fd98e3d25 100644
--- a/hfutils/utils/__init__.py
+++ b/hfutils/utils/__init__.py
@@ -1,6 +1,6 @@
 from .binary import is_binary_file
 from .download import download_file
-from .path import hf_normpath
+from .path import hf_normpath, hf_fs_path, parse_hf_fs_path, HfFileSystemPath
 from .temp import TemporaryDirectory
 from .tqdm_ import tqdm
 from .walk import walk_files
diff --git a/hfutils/utils/irregular_repo.json b/hfutils/utils/irregular_repo.json
new file mode 100644
index 0000000000..7378355758
--- /dev/null
+++ b/hfutils/utils/irregular_repo.json
@@ -0,0 +1,662 @@
+{
+    "datasets": [
+        "acronym_identification",
+        "ade_corpus_v2",
+        "aeslc",
+        "afrikaans_ner_corpus",
+        "ag_news",
+        "air_dialogue",
+        "ajgt_twitter_ar",
+        "allegro_reviews",
+        "allocine",
+        "alt",
+        "amazon_polarity",
+        "amazon_reviews_multi",
+        "amazon_us_reviews",
+        "ambig_qa",
+        "ami",
+        "amttl",
+        "app_reviews",
+        "aqua_rat",
+        "aquamuse",
+        "ar_res_reviews",
+        "ar_sarcasm",
+        "arabic_billion_words",
+        "arabic_pos_dialect",
+        "arabic_speech_corpus",
+        "arcd",
+        "arsentd_lev",
+        "art",
+        "arxiv_dataset",
+        "ascent_kb",
+        "aslg_pc12",
+        "asnq",
+        "assin",
+        "assin2",
+        "atomic",
+        "autshumato",
+        "banking77",
+        "bbaw_egyptian",
+        "bbc_hindi_nli",
+        "bc2gm_corpus",
+        "best2009",
+        "bible_para",
+        "big_patent",
+        "billsum",
+        "bing_coronavirus_query_set",
+        "biomrc",
+        "biosses",
+        "blended_skill_talk",
+        "blog_authorship_corpus",
+        "bn_hate_speech",
+        "bnl_newspapers",
+        "bookcorpus",
+        "bookcorpusopen",
+        "bprec",
+        "break_data",
+        "brwac",
+        "bsd_ja_en",
+        "bswac",
+        "c3",
+        "c4",
+        "cail2018",
+        "caner",
+        "casino",
+        "catalonia_independence",
+        "cats_vs_dogs",
+        "cawac",
+        "cbt",
+        "cc100",
+        "ccaligned_multilingual",
+        "cdsc",
+        "cdt",
+        "cedr",
+        "cfq",
+        "chr_en",
+        "cifar10",
+        "cifar100",
+        "circa",
+        "clickbait_news_bg",
+        "climate_fever",
+        "clinc_oos",
+        "clue",
+        "cmrc2018",
+        "cmu_hinglish_dog",
+        "cnn_dailymail",
+        "coached_conv_pref",
+        "coarse_discourse",
+        "codah",
+        "code_search_net",
+        "code_x_glue_cc_clone_detection_big_clone_bench",
+        "code_x_glue_cc_clone_detection_poj104",
+        "code_x_glue_cc_cloze_testing_all",
+        "code_x_glue_cc_cloze_testing_maxmin",
+        "code_x_glue_cc_code_completion_line",
+        "code_x_glue_cc_code_completion_token",
+        "code_x_glue_cc_code_refinement",
"code_x_glue_cc_code_to_code_trans", + "code_x_glue_cc_defect_detection", + "code_x_glue_ct_code_to_text", + "code_x_glue_tc_nl_code_search_adv", + "code_x_glue_tc_text_to_code", + "code_x_glue_tt_text_to_text", + "com_qa", + "common_language", + "common_voice", + "compguesswhat", + "conceptnet5", + "conll2000", + "conll2002", + "conll2003", + "conllpp", + "consumer-finance-complaints", + "conv_ai", + "conv_ai_2", + "conv_ai_3", + "conv_questions", + "cornell_movie_dialog", + "cos_e", + "cosmos_qa", + "counter", + "covid_qa_castorini", + "covid_qa_deepset", + "covid_qa_ucsd", + "covid_tweets_japanese", + "covost2", + "cppe-5", + "craigslist_bargains", + "crawl_domain", + "crd3", + "crime_and_punish", + "crows_pairs", + "cryptonite", + "cs_restaurants", + "cuad", + "curiosity_dialogs", + "daily_dialog", + "dane", + "danish_political_comments", + "dart", + "datacommons_factcheck", + "dbrd", + "deal_or_no_dialog", + "definite_pronoun_resolution", + "dengue_filipino", + "dialog_re", + "diplomacy_detection", + "disaster_response_messages", + "discofuse", + "discovery", + "disfl_qa", + "doc2dial", + "docred", + "doqa", + "dream", + "dutch_social", + "dyk", + "e2e_nlg", + "e2e_nlg_cleaned", + "ecb", + "ecthr_cases", + "eduge", + "ehealth_kd", + "electricity_load_diagrams", + "eli5", + "eli5_category", + "emea", + "emo", + "emotone_ar", + "empathetic_dialogues", + "enriched_web_nlg", + "eraser_multi_rc", + "esnli", + "eth_py150_open", + "ethos", + "eu_regulatory_ir", + "eurlex", + "euronews", + "europa_eac_tm", + "europa_ecdc_tm", + "event2Mind", + "evidence_infer_treatment", + "factckbr", + "fake_news_english", + "fake_news_filipino", + "farsi_news", + "fashion_mnist", + "fever", + "few_rel", + "financial_phrasebank", + "finer", + "flores", + "flue", + "food101", + "fquad", + "freebase_qa", + "gap", + "gem", + "generated_reviews_enth", + "generics_kb", + "german_legal_entity_recognition", + "germaner", + "germeval_14", + "giga_fren", + "gigaword", + "glucose", + "gnad10", + "go_emotions", + "gooaq", + "google_wellformed_query", + "grail_qa", + "great_code", + "guardian_authorship", + "gutenberg_time", + "hans", + "hansards", + "hard", + "harem", + "has_part", + "hate_offensive", + "hate_speech18", + "hate_speech_filipino", + "hate_speech_pl", + "hate_speech_portuguese", + "hatexplain", + "hausa_voa_ner", + "hausa_voa_topics", + "hda_nli_hindi", + "head_qa", + "health_fact", + "hebrew_projectbenyehuda", + "hebrew_sentiment", + "hebrew_this_world", + "hind_encorp", + "hindi_discourse", + "hippocorpus", + "hkcancor", + "hlgd", + "hope_edi", + "hotpot_qa", + "hover", + "hrenwac_para", + "hrwac", + "humicroedit", + "hybrid_qa", + "hyperpartisan_news_detection", + "iapp_wiki_qa_squad", + "id_clickbait", + "id_liputan6", + "id_nergrit_corpus", + "id_newspapers_2018", + "id_panl_bppt", + "id_puisi", + "igbo_english_machine_translation", + "igbo_monolingual", + "igbo_ner", + "ilist", + "imdb_urdu_reviews", + "imppres", + "indic_glue", + "indonli", + "inquisitive_qg", + "interpress_news_category_tr", + "interpress_news_category_tr_lite", + "irc_disentangle", + "isixhosa_ner_corpus", + "isizulu_ner_corpus", + "iwslt2017", + "jeopardy", + "jnlpba", + "journalists_questions", + "kan_hope", + "kannada_news", + "kd_conv", + "kde4", + "kelm", + "kilt_tasks", + "kilt_wikipedia", + "kinnews_kirnews", + "klue", + "kor_3i4k", + "kor_hate", + "kor_ner", + "kor_nli", + "kor_nlu", + "kor_qpair", + "kor_sae", + "kor_sarcasm", + "labr", + "lama", + "large_spanish_corpus", + "laroseda", + "lc_quad", + "lener_br", + 
"liar", + "librispeech_asr", + "librispeech_lm", + "limit", + "lince", + "linnaeus", + "liveqa", + "lj_speech", + "lm1b", + "lst20", + "m_lama", + "mac_morpho", + "makhzan", + "masakhaner", + "math_dataset", + "math_qa", + "matinf", + "mbpp", + "mc4", + "mc_taco", + "md_gender_bias", + "mdd", + "med_hop", + "medal", + "medical_dialog", + "medical_questions_pairs", + "menyo20k_mt", + "meta_woz", + "metooma", + "metrec", + "miam", + "mkb", + "mkqa", + "mlqa", + "mlsum", + "mnist", + "mocha", + "moroco", + "movie_rationales", + "mrqa", + "ms_marco", + "ms_terms", + "msr_genomics_kbcomp", + "msr_sqa", + "msr_text_compression", + "msr_zhen_translation_parity", + "msra_ner", + "mt_eng_vietnamese", + "muchocine", + "multi_booked", + "multi_news", + "multi_nli_mismatch", + "multi_para_crawl", + "multi_re_qa", + "multi_woz_v22", + "multi_x_science_sum", + "multidoc2dial", + "multilingual_librispeech", + "mutual_friends", + "mwsc", + "myanmar_news", + "narrativeqa", + "narrativeqa_manual", + "natural_questions", + "ncbi_disease", + "nchlt", + "ncslgr", + "nell", + "neural_code_search", + "newsgroup", + "newsph", + "newsph_nli", + "newspop", + "newsqa", + "newsroom", + "nkjp-ner", + "nli_tr", + "nlu_evaluation_data", + "norec", + "norne", + "norwegian_ner", + "nq_open", + "nsmc", + "numer_sense", + "numeric_fused_head", + "oclar", + "offcombr", + "offenseval2020_tr", + "offenseval_dravidian", + "ofis_publik", + "ohsumed", + "ollie", + "omp", + "onestop_english", + "onestop_qa", + "open_subtitles", + "openai_humaneval", + "openslr", + "opinosis", + "orange_sum", + "oscar", + "para_crawl", + "para_pat", + "parsinlu_reading_comprehension", + "pass", + "paws-x", + "paws", + "pec", + "peoples_daily_ner", + "per_sent", + "persian_ner", + "pg19", + "php", + "pib", + "piqa", + "pn_summary", + "poem_sentiment", + "polemo2", + "poleval2019_cyberbullying", + "poleval2019_mt", + "polsum", + "polyglot_ner", + "prachathai67k", + "pragmeval", + "proto_qa", + "psc", + "ptb_text_only", + "pubmed", + "py_ast", + "qa4mre", + "qa_srl", + "qa_zre", + "qangaroo", + "qanta", + "qed", + "qed_amara", + "quac", + "quail", + "quarel", + "quora", + "quoref", + "re_dial", + "reasoning_bg", + "recipe_nlg", + "reclor", + "red_caps", + "reddit_tifu", + "refresd", + "reuters21578", + "riddle_sense", + "ro_sent", + "ro_sts", + "ro_sts_parallel", + "roman_urdu", + "ronec", + "rotten_tomatoes", + "samsum", + "sanskrit_classic", + "saudinewsnet", + "sberquad", + "scan", + "scb_mt_enth_2020", + "scene_parse_150", + "schema_guided_dstc8", + "scielo", + "scientific_papers", + "search_qa", + "sede", + "selqa", + "sem_eval_2010_task_8", + "sem_eval_2014_task_1", + "sem_eval_2018_task_1", + "sem_eval_2020_task_11", + "sent_comp", + "senti_lex", + "senti_ws", + "sentiment140", + "sepedi_ner", + "sesotho_ner_corpus", + "setimes", + "setswana_ner_corpus", + "sharc_modified", + "sick", + "silicone", + "simple_questions_v2", + "siswati_ner_corpus", + "smartdata", + "sms_spam", + "snips_built_in_intents", + "snow_simplified_japanese_corpus", + "so_stacksample", + "social_bias_frames", + "social_i_qa", + "sofc_materials_articles", + "sogou_news", + "spanish_billion_words", + "spc", + "species_800", + "speech_commands", + "squad_adversarial", + "squad_es", + "squad_it", + "squad_kor_v1", + "squad_kor_v2", + "squad_v1_pt", + "squadshifts", + "srwac", + "sst", + "story_cloze", + "stsb_mt_sv", + "style_change_detection", + "subjqa", + "super_glue", + "superb", + "svhn", + "swag", + "swahili", + "swahili_news", + "swda", + "swedish_medical_ner", + 
"swedish_ner_corpus", + "swedish_reviews", + "tab_fact", + "tamilmixsentiment", + "tanzil", + "tapaco", + "tashkeela", + "taskmaster1", + "taskmaster2", + "taskmaster3", + "tatoeba", + "ted_hrlr", + "ted_iwlst2013", + "ted_multi", + "ted_talks_iwslt", + "telugu_books", + "telugu_news", + "tep_en_fa_para", + "text2log", + "thai_toxicity_tweet", + "thainer", + "thaiqa_squad", + "thaisum", + "the_pile_books3", + "the_pile_openwebtext2", + "the_pile_stack_exchange", + "tilde_model", + "time_dial", + "times_of_india_news_headlines", + "timit_asr", + "tlc", + "tmu_gfm_dataset", + "told-br", + "totto", + "trec", + "tsac", + "ttc4900", + "tunizi", + "tuple_ie", + "turk", + "turkic_xwmt", + "turkish_movie_sentiment", + "turkish_ner", + "turkish_product_reviews", + "turkish_shrinked_ner", + "turku_ner_corpus", + "tweet_eval", + "tweet_qa", + "tweets_ar_en_parallel", + "tweets_hate_speech_detection", + "twi_text_c3", + "twi_wordsim353", + "tydiqa", + "ubuntu_dialogs_corpus", + "udhr", + "um005", + "universal_dependencies", + "universal_morphologies", + "urdu_fake_news", + "urdu_sentiment_corpus", + "vctk", + "vivos", + "web_nlg", + "web_of_science", + "web_questions", + "weibo_ner", + "wi_locness", + "wider_face", + "wiki40b", + "wiki_asp", + "wiki_atomic_edits", + "wiki_auto", + "wiki_bio", + "wiki_dpr", + "wiki_hop", + "wiki_lingua", + "wiki_movies", + "wiki_qa", + "wiki_qa_ar", + "wiki_snippets", + "wiki_source", + "wiki_split", + "wiki_summary", + "wikiann", + "wikicorpus", + "wikihow", + "wikipedia", + "wikisql", + "wikitext", + "wikitext_tl39", + "wili_2018", + "wino_bias", + "winograd_wsc", + "winogrande", + "wiqa", + "wisesight1000", + "wisesight_sentiment", + "wnut_17", + "wongnai_reviews", + "woz_dialogue", + "wrbsc", + "x_stance", + "xcopa", + "xcsr", + "xed_en_fi", + "xglue", + "xnli", + "xor_tydi_qa", + "xquad", + "xquad_r", + "xsum_factuality", + "xtreme", + "yahoo_answers_qa", + "yahoo_answers_topics", + "yelp_polarity", + "yelp_review_full", + "yoruba_bbc_topics", + "yoruba_gv_ner", + "yoruba_text_c3", + "yoruba_wordsim353", + "youtube_caption_corrections", + "zest", + "elkarhizketak", + "wikitablequestions", + "conll2012_ontonotesv5", + "monash_tsf", + "roman_urdu_hate_speech", + "adv_glue", + "metashift", + "gsm8k", + "sbu_captions", + "conceptual_captions", + "conceptual_12m", + "visual_genome", + "imagenet-1k", + "tne", + "textvqa", + "ett", + "imagenet_sketch", + "biwi_kinect_head_pose", + "enwik8", + "truthful_qa", + "bigbench", + "quickdraw", + "lccc" + ], + "models": [], + "spaces": [] +} \ No newline at end of file diff --git a/hfutils/utils/path.py b/hfutils/utils/path.py index b15adc19cf..8d4ab909f8 100644 --- a/hfutils/utils/path.py +++ b/hfutils/utils/path.py @@ -1,5 +1,144 @@ +import json import os +import re +from dataclasses import dataclass +from functools import lru_cache +from typing import Optional, Dict, Set, Literal +RepoTypeTyping = Literal['dataset', 'model', 'space'] -def hf_normpath(path): - return os.path.normpath(path).replace('\\', '/') + +def hf_normpath(path) -> str: + """ + Normalize a given path. + + :param path: The path to normalize. + :type path: Any + + :return: The normalized path. + :rtype: str + """ + return re.sub( + r'[\\/]+', '/', + os.path.relpath(os.path.normpath(os.path.join(os.sep, path)), os.sep) + ) + + +def hf_fs_path(repo_id: str, filename: str, + repo_type: RepoTypeTyping = 'dataset', revision: Optional[str] = None): + """ + Get the huggingface filesystem path. + + :param repo_id: The repository ID. 
+    """
+    return re.sub(
+        r'[\\/]+', '/',
+        os.path.relpath(os.path.normpath(os.path.join(os.sep, path)), os.sep)
+    )
+
+
+def hf_fs_path(repo_id: str, filename: str,
+               repo_type: RepoTypeTyping = 'dataset', revision: Optional[str] = None) -> str:
+    """
+    Get the huggingface filesystem path.
+
+    :param repo_id: The repository ID.
+    :type repo_id: str
+
+    :param filename: The filename.
+    :type filename: str
+
+    :param repo_type: The type of repository. (default: 'dataset')
+    :type repo_type: RepoTypeTyping
+
+    :param revision: The revision of the repository. (default: None)
+    :type revision: Optional[str]
+
+    :return: The huggingface filesystem path.
+    :rtype: str
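+
+    Example (cases mirrored from ``test/utils/test_path.py``)::
+
+        >>> hf_fs_path(repo_id='narugo/test_ds_repo', filename='1/2/3', revision='main')
+        'datasets/narugo/test_ds_repo@main/1/2/3'
+        >>> hf_fs_path(repo_id='narugo/test_ds_repo', repo_type='model', filename='1/2/3', revision='r3')
+        'narugo/test_ds_repo@r3/1/2/3'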
+ """ + origin_path = path + repo_type: RepoTypeTyping + if path.startswith('datasets/'): + repo_type = 'dataset' + path = path[len('datasets/'):] + elif path.startswith('spaces/'): + repo_type = 'space' + path = path[len('spaces/'):] + else: + repo_type = 'model' + + matching = _RE_IR_PATH.fullmatch(path) + if matching: + if matching.group('repo_id') not in _irregular_repos()[repo_type]: + matching = None + if not matching: + matching = _RE_PATH.fullmatch(path) + + if matching: + repo_id = matching.group('repo_id') + revision = matching.group('revision') or None + filename = hf_normpath(matching.group('filename') or '.') + return HfFileSystemPath(repo_id, filename, repo_type, revision) + else: + raise ValueError(f'Invalid huggingface filesystem path - {origin_path!r}.') diff --git a/requirements.txt b/requirements.txt index 1024267b78..6ca723a5ba 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,4 @@ tqdm requests click>=7 tzlocal +natsort \ No newline at end of file diff --git a/test/operate/test_base.py b/test/operate/test_base.py index cd9d7ddf81..2b4388df70 100644 --- a/test/operate/test_base.py +++ b/test/operate/test_base.py @@ -21,6 +21,16 @@ def test_list_files_in_repository(self): assert (set(should_exists) & set(files)) == set(should_exists) assert not (set(should_not_exists) & set(files)) + def test_list_files_in_repository_revision(self): + files = list_files_in_repository( + repo_id='narugo/test_ds_repo', + repo_type='dataset', + revision='another_branch', + ) + should_exists = ['cloc.sh', 'raw_text', 'surtr_dataset.zip', 'surtr_dataset.zip_x'] + assert (set(should_exists) & set(files)) == set(should_exists) + assert not (set(should_not_exists) & set(files)) + def test_list_files_in_repository_no_ignore(self): files = list_files_in_repository('deepghs/highres_datasets', ignore_patterns=[]) should_exists = [ diff --git a/test/utils/test_path.py b/test/utils/test_path.py new file mode 100644 index 0000000000..6a4a8ba995 --- /dev/null +++ b/test/utils/test_path.py @@ -0,0 +1,86 @@ +import os.path + +import pytest + +from hfutils.utils import hf_normpath, hf_fs_path, parse_hf_fs_path, HfFileSystemPath + + +@pytest.mark.unittest +class TestUtilsPath: + def test_hf_normpath(self): + assert hf_normpath('./1/2/3') == '1/2/3' + assert hf_normpath('1/../2/3') == '2/3' + assert hf_normpath('1///3') == '1/3' + assert hf_normpath('1\\2/3') == '1/2/3' + assert hf_normpath(os.path.join('1', '..', '2', '3', '4')) == '2/3/4' + + def test_hf_fs_path(self): + assert hf_fs_path( + repo_id='narugo/test_ds_repo', + filename='1/2\\3' + ) == 'datasets/narugo/test_ds_repo/1/2/3' + assert hf_fs_path( + repo_id='narugo/test_ds_repo', + filename='1/2\\3', + revision='main', + ) == 'datasets/narugo/test_ds_repo@main/1/2/3' + assert hf_fs_path( + repo_id='narugo/test_ds_repo', + repo_type='model', + filename='1/2\\3', + revision='r3', + ) == 'narugo/test_ds_repo@r3/1/2/3' + assert hf_fs_path( + repo_id='narugo/test_ds_repo', + repo_type='space', + filename='1/2\\3', + revision='r3', + ) == 'spaces/narugo/test_ds_repo@r3/1/2/3' + + def test_parse_hf_fs_path(self): + assert parse_hf_fs_path('datasets/narugo/test_ds_repo/1/2/3') == HfFileSystemPath( + repo_id='narugo/test_ds_repo', + filename='1/2/3', + revision=None, + repo_type='dataset', + ) + assert parse_hf_fs_path('datasets/narugo/test_ds_repo@main/1/2/3') == HfFileSystemPath( + repo_id='narugo/test_ds_repo', + filename='1/2/3', + revision='main', + repo_type='dataset', + ) + assert parse_hf_fs_path('narugo/test_ds_repo@r3/1/2/3') == 
+    """
+    origin_path = path
+    repo_type: RepoTypeTyping
+    if path.startswith('datasets/'):
+        repo_type = 'dataset'
+        path = path[len('datasets/'):]
+    elif path.startswith('spaces/'):
+        repo_type = 'space'
+        path = path[len('spaces/'):]
+    else:
+        repo_type = 'model'
+
+    matching = _RE_IR_PATH.fullmatch(path)
+    if matching:
+        if matching.group('repo_id') not in _irregular_repos()[repo_type]:
+            matching = None
+    if not matching:
+        matching = _RE_PATH.fullmatch(path)
+
+    if matching:
+        repo_id = matching.group('repo_id')
+        revision = matching.group('revision') or None
+        filename = hf_normpath(matching.group('filename') or '.')
+        return HfFileSystemPath(repo_id, filename, repo_type, revision)
+    else:
+        raise ValueError(f'Invalid huggingface filesystem path - {origin_path!r}.')
diff --git a/requirements.txt b/requirements.txt
index 1024267b78..6ca723a5ba 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ tqdm
 requests
 click>=7
 tzlocal
+natsort
\ No newline at end of file
diff --git a/test/operate/test_base.py b/test/operate/test_base.py
index cd9d7ddf81..2b4388df70 100644
--- a/test/operate/test_base.py
+++ b/test/operate/test_base.py
@@ -21,6 +21,16 @@ def test_list_files_in_repository(self):
         assert (set(should_exists) & set(files)) == set(should_exists)
         assert not (set(should_not_exists) & set(files))
 
+    def test_list_files_in_repository_revision(self):
+        files = list_files_in_repository(
+            repo_id='narugo/test_ds_repo',
+            repo_type='dataset',
+            revision='another_branch',
+        )
+        should_exists = ['cloc.sh', 'raw_text', 'surtr_dataset.zip', 'surtr_dataset.zip_x']
+        # assumed: '.gitattributes' is filtered out by the default ignore patterns
+        should_not_exists = ['.gitattributes']
+        assert (set(should_exists) & set(files)) == set(should_exists)
+        assert not (set(should_not_exists) & set(files))
+
     def test_list_files_in_repository_no_ignore(self):
         files = list_files_in_repository('deepghs/highres_datasets', ignore_patterns=[])
         should_exists = [
diff --git a/test/utils/test_path.py b/test/utils/test_path.py
new file mode 100644
index 0000000000..6a4a8ba995
--- /dev/null
+++ b/test/utils/test_path.py
@@ -0,0 +1,86 @@
+import os.path
+
+import pytest
+
+from hfutils.utils import hf_normpath, hf_fs_path, parse_hf_fs_path, HfFileSystemPath
+
+
+@pytest.mark.unittest
+class TestUtilsPath:
+    def test_hf_normpath(self):
+        assert hf_normpath('./1/2/3') == '1/2/3'
+        assert hf_normpath('1/../2/3') == '2/3'
+        assert hf_normpath('1///3') == '1/3'
+        assert hf_normpath('1\\2/3') == '1/2/3'
+        assert hf_normpath(os.path.join('1', '..', '2', '3', '4')) == '2/3/4'
+
+    def test_hf_fs_path(self):
+        assert hf_fs_path(
+            repo_id='narugo/test_ds_repo',
+            filename='1/2\\3'
+        ) == 'datasets/narugo/test_ds_repo/1/2/3'
+        assert hf_fs_path(
+            repo_id='narugo/test_ds_repo',
+            filename='1/2\\3',
+            revision='main',
+        ) == 'datasets/narugo/test_ds_repo@main/1/2/3'
+        assert hf_fs_path(
+            repo_id='narugo/test_ds_repo',
+            repo_type='model',
+            filename='1/2\\3',
+            revision='r3',
+        ) == 'narugo/test_ds_repo@r3/1/2/3'
+        assert hf_fs_path(
+            repo_id='narugo/test_ds_repo',
+            repo_type='space',
+            filename='1/2\\3',
+            revision='r3',
+        ) == 'spaces/narugo/test_ds_repo@r3/1/2/3'
+
+    def test_parse_hf_fs_path(self):
+        assert parse_hf_fs_path('datasets/narugo/test_ds_repo/1/2/3') == HfFileSystemPath(
+            repo_id='narugo/test_ds_repo',
+            filename='1/2/3',
+            revision=None,
+            repo_type='dataset',
+        )
+        assert parse_hf_fs_path('datasets/narugo/test_ds_repo@main/1/2/3') == HfFileSystemPath(
+            repo_id='narugo/test_ds_repo',
+            filename='1/2/3',
+            revision='main',
+            repo_type='dataset',
+        )
+        assert parse_hf_fs_path('narugo/test_ds_repo@r3/1/2/3') == HfFileSystemPath(
+            repo_id='narugo/test_ds_repo',
+            repo_type='model',
+            filename='1/2/3',
+            revision='r3',
+        )
+        assert parse_hf_fs_path('spaces/narugo/test_ds_repo@r3/1/2/3') == HfFileSystemPath(
+            repo_id='narugo/test_ds_repo',
+            repo_type='space',
+            filename='1/2/3',
+            revision='r3',
+        )
+        assert parse_hf_fs_path('datasets/imagenet-1k/classes.py') == HfFileSystemPath(
+            repo_id='imagenet-1k',
+            repo_type='dataset',
+            filename='classes.py',
+            revision=None,
+        )
+        assert parse_hf_fs_path('datasets/imagenet-1k@main/classes.py') == HfFileSystemPath(
+            repo_id='imagenet-1k',
+            repo_type='dataset',
+            filename='classes.py',
+            revision='main',
+        )
+        assert parse_hf_fs_path('datasets/narugo/test_ds_repo') == HfFileSystemPath(
+            repo_id='narugo/test_ds_repo',
+            filename='.',
+            revision=None,
+            repo_type='dataset',
+        )
+
+    def test_parse_hf_fs_path_invalid(self):
+        with pytest.raises(ValueError):
+            _ = parse_hf_fs_path('datasets/narugo/test_ds_repo@@main/classes.py')
diff --git a/tools/__init__.py b/tools/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tools/irregular_repo.py b/tools/irregular_repo.py
new file mode 100644
index 0000000000..b94aa4e590
--- /dev/null
+++ b/tools/irregular_repo.py
@@ -0,0 +1,49 @@
+import json
+import logging
+import os.path
+
+from hbutils.string import plural_word
+from natsort import natsorted
+
+from hfutils.operate import get_hf_client
+from hfutils.utils import tqdm
+
+
+def main():
+    hf_client = get_hf_client()
+
+    logging.info('Scanning datasets')
+    ir_datasets = []
+    for item in tqdm(hf_client.list_datasets(), desc='Hf Datasets'):
+        # a regular repo id has exactly one '/' (namespace/name); anything else is irregular
+        if item.id.count('/') != 1:
+            ir_datasets.append(item.id)
+    ir_datasets = natsorted(set(ir_datasets))
+    logging.info(f'{plural_word(len(ir_datasets), "irregular dataset")} found.')
+
+    logging.info('Scanning models')
+    ir_models = []
+    for item in tqdm(hf_client.list_models(), desc='Hf Models'):
+        if item.id.count('/') != 1:
+            ir_models.append(item.id)
+    ir_models = natsorted(set(ir_models))
+    logging.info(f'{plural_word(len(ir_models), "irregular model")} found.')
+
+    logging.info('Scanning spaces')
+    ir_spaces = []
+    for item in tqdm(hf_client.list_spaces(), desc='Hf Spaces'):
+        if item.id.count('/') != 1:
+            ir_spaces.append(item.id)
+    ir_spaces = natsorted(set(ir_spaces))
+    logging.info(f'{plural_word(len(ir_spaces), "irregular space")} found.')
+
+    target = os.path.join('hfutils', 'utils', 'irregular_repo.json')
+    logging.info(f'Saving to {target!r} ...')
+    with open(target, 'w') as f:
+        json.dump({
+            'datasets': ir_datasets,
+            'models': ir_models,
+            'spaces': ir_spaces,
+        }, f, sort_keys=True, ensure_ascii=False, indent=4)
+
+
+if __name__ == '__main__':
+    logging.basicConfig(level=logging.INFO)
+    main()