From d5537ac1a4e2d1ff2d1176df41fe44897f5a22f9 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Wed, 27 Nov 2024 20:21:31 +0800 Subject: [PATCH 1/5] dev(narugo): save new upload code --- hfutils/entry/upload.py | 5 ++- hfutils/operate/upload.py | 89 ++++++++++++++++++++++++++++++++------- hfutils/utils/arrange.py | 10 +++-- 3 files changed, 85 insertions(+), 19 deletions(-) diff --git a/hfutils/entry/upload.py b/hfutils/entry/upload.py index 7ddf4dcacf7..fccf5d554b4 100644 --- a/hfutils/entry/upload.py +++ b/hfutils/entry/upload.py @@ -76,11 +76,13 @@ def _add_upload_subcommand(cli: click.Group) -> click.Group: help='Wildcard for files to download. Only applied when -d is used.', show_default=True) @click.option('-m', '--message', 'message', type=str, default=None, help='Commit message for this operation.', show_default=True) + @click.option('-s', '--max_size_per_pack', 'max_size_per_pack', type=str, default=None, + help='Max size per archive packages, only applied when -a is assigned.', show_default=True) @command_wrap() def upload(repo_id: str, repo_type: RepoTypeTyping, file_in_repo: Optional[str], archive_in_repo: Optional[str], dir_in_repo: Optional[str], input_path: str, revision: str, clear: bool, private: bool, public: bool, wildcard: Optional[str], - message: Optional[str]): + message: Optional[str], max_size_per_pack: Optional[str]): """ Upload data to HuggingFace repositories. @@ -169,6 +171,7 @@ def upload(repo_id: str, repo_type: RepoTypeTyping, revision=revision, pattern=wildcard, silent=False, + max_size_per_pack=max_size_per_pack, message=message, ) diff --git a/hfutils/operate/upload.py b/hfutils/operate/upload.py index 48d96e46672..d46195f7da3 100644 --- a/hfutils/operate/upload.py +++ b/hfutils/operate/upload.py @@ -10,14 +10,15 @@ import os.path import re import time -from typing import Optional, List +from typing import Optional, List, Union from hbutils.string import plural_word from huggingface_hub import CommitOperationAdd, CommitOperationDelete from .base import RepoTypeTyping, get_hf_client, list_files_in_repository, _IGNORE_PATTERN_UNSET -from ..archive import get_archive_type, archive_pack -from ..utils import walk_files, TemporaryDirectory, tqdm +from ..archive import get_archive_type, archive_pack, archive_writer +from ..config.meta import __VERSION__ +from ..utils import walk_files, TemporaryDirectory, tqdm, walk_files_with_groups, FilesGroup, hf_normpath def upload_file_to_file(local_file, repo_id: str, file_in_repo: str, @@ -53,13 +54,15 @@ def upload_file_to_file(local_file, repo_id: str, file_in_repo: str, path_or_fileobj=local_file, path_in_repo=file_in_repo, revision=revision, - commit_message=message, + commit_message=message or f'Upload file {hf_normpath(file_in_repo)!r} with hfutils v{__VERSION__}', ) def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: str, pattern: Optional[str] = None, repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', message: Optional[str] = None, silent: bool = False, + group_method: Optional[Union[str, int]] = None, + max_size_per_pack: Optional[Union[str, float]] = None, hf_token: Optional[str] = None): """ Upload a local directory as an archive file to a specified path in a Hugging Face repository. @@ -80,6 +83,12 @@ def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: :type message: Optional[str] :param silent: If True, suppress progress bar output. :type silent: bool + :param group_method: Method for grouping files (None for default, int for segment count). 
+ Only applied when ``max_total_size`` is assigned. + :type group_method: Optional[Union[str, int]] + :param max_size_per_pack: Maximum total size for each group (can be string like "1GB"). + When assigned, this function will try to upload with multiple archive files. + :type max_size_per_pack: Optional[Union[str, float]] :param hf_token: Huggingface token for API client, use ``HF_TOKEN`` variable if not assigned. :type hf_token: str, optional @@ -91,16 +100,66 @@ def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: """ archive_type = get_archive_type(archive_in_repo) with TemporaryDirectory() as td: - local_archive_file = os.path.join(td, os.path.basename(archive_in_repo)) - archive_pack( - type_name=archive_type, - directory=local_directory, - archive_file=local_archive_file, - pattern=pattern, - silent=silent, - ) - upload_file_to_file(local_archive_file, repo_id, archive_in_repo, - repo_type, revision, message, hf_token=hf_token) + if max_size_per_pack is not None: + file_groups = walk_files_with_groups( + directory=local_directory, + pattern=pattern, + group_method=group_method, + max_total_size=max_size_per_pack, + silent=silent, + ) + if len(file_groups) < 2: + file_groups = None + else: + file_groups = None + + if file_groups is None: + local_archive_file = os.path.join(td, os.path.basename(archive_in_repo)) + archive_pack( + type_name=archive_type, + directory=local_directory, + archive_file=local_archive_file, + pattern=pattern, + silent=silent, + ) + upload_file_to_file( + repo_id=repo_id, + repo_type=repo_type, + local_file=local_archive_file, + file_in_repo=archive_in_repo, + revision=revision, + message=message or f'Upload archive {hf_normpath(archive_in_repo)!r} with hfutils v{__VERSION__}', + hf_token=hf_token + ) + + else: + id_pattern = f'{{x:0{max(len(str(len(file_groups))), 5)}d}}' + raw_dst_archive_file = os.path.normpath(os.path.join(td, archive_in_repo)) + for gid, group in enumerate(file_groups, start=1): + group: FilesGroup + dst_archive_file_body, dst_archive_file_ext = os.path.splitext(raw_dst_archive_file) + dst_archive_file = (f'{dst_archive_file_body}' + f'-{id_pattern.format(x=gid)}-of-{id_pattern.format(x=len(file_groups))}' + f'{dst_archive_file_ext}') + os.makedirs(os.path.dirname(dst_archive_file), exist_ok=True) + with archive_writer(type_name=archive_type, archive_file=dst_archive_file) as af, \ + tqdm(group.files, silent=silent, + desc=f'Packing {local_directory!r} #{gid}/{len(file_groups)} ...') as progress: + for file in progress: + progress.set_description(file) + af.add(os.path.join(local_directory, file), file) + + upload_directory_as_directory( + repo_id=repo_id, + repo_type=repo_type, + local_directory=td, + path_in_repo='.', + revision=revision, + message=message or f'Upload archive {hf_normpath(archive_in_repo)!r} ' + f'({plural_word(len(file_groups), "packs")}) ' + f'with hfutils v{__VERSION__}', + hf_token=hf_token, + ) _PATH_SEP = re.compile(r'[/\\]+') @@ -196,7 +255,7 @@ def upload_directory_as_directory( )) current_time = datetime.datetime.now().astimezone().strftime('%Y-%m-%d %H:%M:%S %Z') - commit_message = message or f'Upload directory {os.path.basename(os.path.abspath(local_directory))!r}' + commit_message = message or f'Upload directory {hf_normpath(path_in_repo)!r} with hfutils v{__VERSION__}' if time_suffix: commit_message = f'{commit_message}, on {current_time}' diff --git a/hfutils/utils/arrange.py b/hfutils/utils/arrange.py index 8dd1946878a..11c53474afa 100644 --- a/hfutils/utils/arrange.py +++ 
b/hfutils/utils/arrange.py @@ -19,6 +19,7 @@ from hbutils.scale import size_to_bytes from natsort import natsorted +from tqdm import tqdm from .heap import Heap from .walk import walk_files @@ -199,7 +200,8 @@ def _group_by(files: List[FileItem], group_method: Optional[Union[str, int]] = N def walk_files_with_groups(directory: str, pattern: Optional[str] = None, group_method: Optional[Union[str, int]] = None, - max_total_size: Optional[Union[str, float]] = None) \ + max_total_size: Optional[Union[str, float]] = None, + silent: bool = False) \ -> List[FilesGroup]: """ Walk through a directory and group files based on specified criteria. @@ -215,6 +217,8 @@ def walk_files_with_groups(directory: str, pattern: Optional[str] = None, :type group_method: Optional[Union[str, int]] :param max_total_size: Maximum total size for each group (can be string like "1GB") :type max_total_size: Optional[Union[str, float]] + :param silent: If True, the progress bar content will not be displayed. + :type silent: bool :return: List of file groups :rtype: List[FilesGroup] @@ -228,7 +232,7 @@ def walk_files_with_groups(directory: str, pattern: Optional[str] = None, """ all_items = [ FileItem.from_file(os.path.join(directory, file), rel_to=directory) - for file in walk_files(directory, pattern=pattern) + for file in tqdm(walk_files(directory, pattern=pattern), desc=f'Scanning {directory!r} ...', disable=silent) ] if max_total_size is not None and isinstance(max_total_size, str): max_total_size = size_to_bytes(max_total_size) @@ -242,7 +246,7 @@ def walk_files_with_groups(directory: str, pattern: Optional[str] = None, raw_groups: List[Union[FileItem, FilesGroup]] = _group_by(all_items, group_method=group_method) collected_groups: List[FilesGroup] = [] heap: Heap[FilesGroup] = Heap(key=lambda x: (x.size, x.count)) - for group in raw_groups: + for group in tqdm(raw_groups, desc='Arranging Files', disable=silent): if not heap or (heap.peek().size + group.size) > max_total_size: new_group = FilesGroup.new() heap.push(new_group) From 646e95329e7aabed83180db3d4fa4fd36781c035 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Wed, 27 Nov 2024 20:33:23 +0800 Subject: [PATCH 2/5] dev(narugo): x --- docs/source/api_doc/utils/ext.rst | 15 ++++++++ docs/source/api_doc/utils/index.rst | 1 + hfutils/operate/upload.py | 1 - hfutils/utils/__init__.py | 1 + hfutils/utils/ext.py | 40 +++++++++++++++++++++ test/utils/test_ext.py | 56 +++++++++++++++++++++++++++++ 6 files changed, 113 insertions(+), 1 deletion(-) create mode 100644 docs/source/api_doc/utils/ext.rst create mode 100644 hfutils/utils/ext.py create mode 100644 test/utils/test_ext.py diff --git a/docs/source/api_doc/utils/ext.rst b/docs/source/api_doc/utils/ext.rst new file mode 100644 index 00000000000..b193301a100 --- /dev/null +++ b/docs/source/api_doc/utils/ext.rst @@ -0,0 +1,15 @@ +hfutils.utils.ext +================================= + +.. currentmodule:: hfutils.utils.ext + +.. automodule:: hfutils.utils.ext + + +splitext_with_composite +------------------------------------------------- + +.. 
autofunction:: splitext_with_composite + + + diff --git a/docs/source/api_doc/utils/index.rst b/docs/source/api_doc/utils/index.rst index d469027ee72..fac7a21b4fd 100644 --- a/docs/source/api_doc/utils/index.rst +++ b/docs/source/api_doc/utils/index.rst @@ -13,6 +13,7 @@ hfutils.utils arrange binary data + ext heap download model diff --git a/hfutils/operate/upload.py b/hfutils/operate/upload.py index d46195f7da3..e135cb35e5f 100644 --- a/hfutils/operate/upload.py +++ b/hfutils/operate/upload.py @@ -146,7 +146,6 @@ def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: tqdm(group.files, silent=silent, desc=f'Packing {local_directory!r} #{gid}/{len(file_groups)} ...') as progress: for file in progress: - progress.set_description(file) af.add(os.path.join(local_directory, file), file) upload_directory_as_directory( diff --git a/hfutils/utils/__init__.py b/hfutils/utils/__init__.py index a67b318bd98..522fa5318d2 100644 --- a/hfutils/utils/__init__.py +++ b/hfutils/utils/__init__.py @@ -3,6 +3,7 @@ from .binary import is_binary_file from .data import is_data_file from .download import download_file +from .ext import splitext_with_composite from .heap import Heap from .logging import ColoredFormatter from .number import number_to_tag diff --git a/hfutils/utils/ext.py b/hfutils/utils/ext.py new file mode 100644 index 00000000000..0b0227e49f0 --- /dev/null +++ b/hfutils/utils/ext.py @@ -0,0 +1,40 @@ +""" +This module provides utilities for handling file extensions, particularly for files with composite extensions. +It extends the standard os.path.splitext functionality to support multi-part file extensions. +""" + +import os + + +def splitext_with_composite(filename, composite_extensions): + """ + Split a filename into a pair (root, ext) where ext is a composite extension if it matches + one of the provided composite extensions, otherwise behaves like os.path.splitext. + + This function is particularly useful when dealing with files that have multi-part extensions + (e.g., '.tar.gz', '.config.json') where standard os.path.splitext would only split at the + last dot. + + :param filename: The filename to split. + :type filename: str + :param composite_extensions: A sequence of composite extensions to check against (e.g., ['.tar.gz', '.config.json']). + The matching is case-insensitive. + :type composite_extensions: list[str] or tuple[str] + + :return: A tuple containing the root and the extension. If the filename ends with any of the + composite extensions, the extension will be the full composite extension. Otherwise, + returns the result of os.path.splitext. 
+ :rtype: tuple[str, str] + + :example: + >>> splitext_with_composite('file.tar.gz', ['.tar.gz']) + ('file', '.tar.gz') + >>> splitext_with_composite('file.config.json', ['.config.json']) + ('file', '.config.json') + >>> splitext_with_composite('file.txt', ['.tar.gz']) + ('file', '.txt') + """ + for ext in composite_extensions: + if filename.lower().endswith(ext.lower()): + return filename[:-len(ext)], filename[-len(ext):] + return os.path.splitext(filename) diff --git a/test/utils/test_ext.py b/test/utils/test_ext.py new file mode 100644 index 00000000000..b956ba72de1 --- /dev/null +++ b/test/utils/test_ext.py @@ -0,0 +1,56 @@ +import pytest + +from hfutils.utils import splitext_with_composite + + +@pytest.mark.unittest +class TestSplitextWithComposite: + @pytest.mark.parametrize("filename,composite_extensions,expected", [ + ("test.txt", [".txt"], ("test", ".txt")), + ("test.pdf", [".pdf"], ("test", ".pdf")), + ("test.doc", [".doc"], ("test", ".doc")), + ("test.py", [".py"], ("test", ".py")), + + ("archive.tar.gz", [".tar.gz"], ("archive", ".tar.gz")), + ("data.tar.bz2", [".tar.bz2"], ("data", ".tar.bz2")), + ("video.tar.xz", [".tar.xz"], ("video", ".tar.xz")), + ("backup.tar.7z", [".tar.7z"], ("backup", ".tar.7z")), + + ("test.tar.gz", [".tar.bz2", ".tar.gz"], ("test", ".tar.gz")), + ("test.tar.bz2", [".tar.gz", ".tar.bz2"], ("test", ".tar.bz2")), + + ("TEST.TXT", [".txt"], ("TEST", ".TXT")), + ("Test.PDF", [".pdf"], ("Test", ".PDF")), + ("test.TAR.GZ", [".tar.gz"], ("test", ".TAR.GZ")), + + ("noextension", [], ("noextension", "")), + ("noextension", [".txt"], ("noextension", "")), + ("justname", [".tar.gz", ".txt"], ("justname", "")), + + (".hidden", [], (".hidden", "")), + (".hidden.txt", [".txt"], (".hidden", ".txt")), + (".config.json", [".json"], (".config", ".json")), + + ("my.test.file.txt", [".txt"], ("my.test.file", ".txt")), + ("data.backup.tar.gz", [".tar.gz"], ("data.backup", ".tar.gz")), + + ("test-file.txt", [".txt"], ("test-file", ".txt")), + ("test_file.tar.gz", [".tar.gz"], ("test_file", ".tar.gz")), + ("test space.pdf", [".pdf"], ("test space", ".pdf")), + + ("test.txt", [], ("test", ".txt")), + ("test.tar.gz", [], ("test.tar", ".gz")), + + ("test.xyz", [".txt", ".pdf", ".tar.gz"], ("test", ".xyz")), + ("test.abc", [".def", ".ghi"], ("test", ".abc")), + + ("测试.txt", [".txt"], ("测试", ".txt")), + ("тест.tar.gz", [".tar.gz"], ("тест", ".tar.gz")), + + (".", [], (".", "")), + ("..", [], ("..", "")), + ("..txt", [".txt"], (".", ".txt")) + ]) + def test_splitext_with_composite(self, filename, composite_extensions, expected): + result = splitext_with_composite(filename, composite_extensions) + assert result == expected From 2bc860bce9f41011f94ddd305fc894e94e5137ca Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Wed, 27 Nov 2024 20:37:03 +0800 Subject: [PATCH 3/5] dev(narugo): add splitext --- hfutils/archive/__init__.py | 2 +- hfutils/archive/base.py | 15 +++++++++++++++ hfutils/operate/upload.py | 4 ++-- 3 files changed, 18 insertions(+), 3 deletions(-) diff --git a/hfutils/archive/__init__.py b/hfutils/archive/__init__.py index e9e2e786f71..1a181dfbd19 100644 --- a/hfutils/archive/__init__.py +++ b/hfutils/archive/__init__.py @@ -18,7 +18,7 @@ The creation of archive files in the RAR format is not supported, as we utilize the `rarfile `_ library, which does not offer functionality for creating RAR files. 
""" from .base import register_archive_type, archive_pack, archive_unpack, get_archive_type, get_archive_extname, \ - archive_writer, ArchiveWriter + archive_writer, ArchiveWriter, archive_splitext from .rar import _rar_pack, _rar_unpack, RARWriter from .sevenz import _7z_pack, _7z_unpack, SevenZWriter from .tar import _tarfile_pack, _tarfile_unpack, TarWriter diff --git a/hfutils/archive/base.py b/hfutils/archive/base.py index 4d3d2b667cf..ab84fc894db 100644 --- a/hfutils/archive/base.py +++ b/hfutils/archive/base.py @@ -12,8 +12,11 @@ import os.path import warnings +from functools import lru_cache from typing import List, Dict, Tuple, Callable, Optional +from hfutils.utils import splitext_with_composite + class ArchiveWriter: """ @@ -272,3 +275,15 @@ def archive_writer(type_name: str, archive_file: str) -> ArchiveWriter: f'We strongly recommend using a regular extension name for the archive file.') return fn_writer(archive_file) + + +@lru_cache() +def _get_all_extensions(): + extensions = [] + for type_name, (exts, _, _, _) in _KNOWN_ARCHIVE_TYPES.items(): + extensions.extend(exts) + return extensions + + +def archive_splitext(filename: str) -> Tuple[str, str]: + return splitext_with_composite(filename, _get_all_extensions()) diff --git a/hfutils/operate/upload.py b/hfutils/operate/upload.py index e135cb35e5f..bc64a0b3540 100644 --- a/hfutils/operate/upload.py +++ b/hfutils/operate/upload.py @@ -16,7 +16,7 @@ from huggingface_hub import CommitOperationAdd, CommitOperationDelete from .base import RepoTypeTyping, get_hf_client, list_files_in_repository, _IGNORE_PATTERN_UNSET -from ..archive import get_archive_type, archive_pack, archive_writer +from ..archive import get_archive_type, archive_pack, archive_writer, archive_splitext from ..config.meta import __VERSION__ from ..utils import walk_files, TemporaryDirectory, tqdm, walk_files_with_groups, FilesGroup, hf_normpath @@ -137,7 +137,7 @@ def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: raw_dst_archive_file = os.path.normpath(os.path.join(td, archive_in_repo)) for gid, group in enumerate(file_groups, start=1): group: FilesGroup - dst_archive_file_body, dst_archive_file_ext = os.path.splitext(raw_dst_archive_file) + dst_archive_file_body, dst_archive_file_ext = archive_splitext(raw_dst_archive_file) dst_archive_file = (f'{dst_archive_file_body}' f'-{id_pattern.format(x=gid)}-of-{id_pattern.format(x=len(file_groups))}' f'{dst_archive_file_ext}') From 32b8e65cb3c1828d4bc1c2f1d5dbe76dac45c5a5 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Wed, 27 Nov 2024 20:43:53 +0800 Subject: [PATCH 4/5] dev(narugo): complete this part --- docs/source/api_doc/archive/index.rst | 8 ++ hfutils/archive/base.py | 180 ++++++++++++++------------ hfutils/entry/upload.py | 52 ++++---- 3 files changed, 133 insertions(+), 107 deletions(-) diff --git a/docs/source/api_doc/archive/index.rst b/docs/source/api_doc/archive/index.rst index 26328e82349..e101be71abb 100644 --- a/docs/source/api_doc/archive/index.rst +++ b/docs/source/api_doc/archive/index.rst @@ -56,3 +56,11 @@ get_archive_extname +archive_splitext +---------------------------------------- + +.. autofunction:: archive_splitext + + + + diff --git a/hfutils/archive/base.py b/hfutils/archive/base.py index ab84fc894db..c4958220a8c 100644 --- a/hfutils/archive/base.py +++ b/hfutils/archive/base.py @@ -1,9 +1,5 @@ """ -This module provides functionality for handling archive files in various formats. 
- -It includes functions for registering custom archive types, packing directories into archives, -unpacking archives, and determining archive types based on file extensions. The module supports -a flexible system for working with different archive formats through a registration mechanism. +Archive handling module for managing different types of archive files. .. note:: This module uses a global dictionary to store registered archive types, so it's @@ -20,13 +16,18 @@ class ArchiveWriter: """ - A base class for creating archive writers. + Base class for creating and managing archive writers. This class provides a context manager interface for handling archive files, - allowing files to be added to the archive and ensuring proper resource management. + allowing for safe resource management and consistent file addition operations. + It serves as a template for specific archive format implementations. - :param archive_file: The path to the archive file to be created or modified. + :param archive_file: Path to the archive file to be created or modified. :type archive_file: str + + Example: + >>> with ArchiveWriter('output.zip') as writer: + ... writer.add('file.txt', 'archive_path/file.txt') """ def __init__(self, archive_file: str): @@ -35,12 +36,12 @@ def __init__(self, archive_file: str): def _create_handler(self): """ - Create the handler for the archive writer. + Create the underlying archive handler. - This method should be overridden by subclasses to provide specific - handler creation logic for different archive types. + This method should be implemented by subclasses to initialize the + specific archive format handler. - :raises NotImplementedError: If not overridden in a subclass. + :raises NotImplementedError: When called on the base class. """ raise NotImplementedError # pragma: no cover @@ -48,14 +49,14 @@ def _add_file(self, filename: str, arcname: str): """ Add a file to the archive. - This method should be overridden by subclasses to define how files - are added to the archive for different formats. + This method should be implemented by subclasses to define the + specific file addition logic for each archive format. - :param filename: The path to the file to add to the archive. + :param filename: Path to the file to be added. :type filename: str - :param arcname: The archive name for the file. + :param arcname: Desired path within the archive. :type arcname: str - :raises NotImplementedError: If not overridden in a subclass. + :raises NotImplementedError: When called on the base class. """ raise NotImplementedError # pragma: no cover @@ -63,7 +64,8 @@ def open(self): """ Open the archive for writing. - Initializes the handler if it has not been created yet. + Initializes the archive handler if it hasn't been created yet. + This method is automatically called when using the context manager. """ if self._handler is None: self._handler = self._create_handler() @@ -72,18 +74,19 @@ def add(self, filename: str, arcname: str): """ Add a file to the archive. - :param filename: The path to the file to add. + :param filename: Path to the file to be added. :type filename: str - :param arcname: The name to use for the file within the archive. + :param arcname: Desired path within the archive. :type arcname: str """ return self._add_file(filename, arcname) def close(self): """ - Close the archive. + Close the archive and release resources. - Ensures that all resources are properly released. 
+ This method ensures proper cleanup of resources and is automatically + called when using the context manager. """ if self._handler is not None: self._handler.close() @@ -91,18 +94,23 @@ def close(self): def __enter__(self): """ - Enter the runtime context related to this object. + Context manager entry point. - Opens the archive for writing. + :return: Self reference for use in context manager. + :rtype: ArchiveWriter """ self.open() return self def __exit__(self, exc_type, exc_val, exc_tb): """ - Exit the runtime context related to this object. + Context manager exit point. - Closes the archive, ensuring that resources are released. + Ensures proper cleanup of resources when exiting the context. + + :param exc_type: Exception type if an error occurred. + :param exc_val: Exception value if an error occurred. + :param exc_tb: Exception traceback if an error occurred. """ self.close() @@ -113,31 +121,28 @@ def __exit__(self, exc_type, exc_val, exc_tb): def register_archive_type(name: str, exts: List[str], fn_pack: Callable, fn_unpack: Callable, fn_writer: _FN_WRITER): """ - Register a custom archive type with associated file extensions and packing/unpacking functions. + Register a new archive type with its associated handlers and extensions. - This function allows users to add support for new archive types by providing the necessary - information and functions to handle the archive format. + This function allows for the registration of custom archive formats by providing + the necessary functions for packing, unpacking, and creating archive writers. - :param name: The name of the archive type (e.g., 'zip', 'tar'). + :param name: Identifier for the archive type (e.g., 'zip', 'tar'). :type name: str - :param exts: A list of file extensions associated with the archive type (e.g., ['.zip']). + :param exts: List of file extensions for this archive type (e.g., ['.zip']). :type exts: List[str] - :param fn_pack: The packing function that takes a directory and an archive filename as input and creates an archive. + :param fn_pack: Function to create archives of this type. :type fn_pack: Callable - :param fn_unpack: The unpacking function that takes an archive filename and a directory as input and extracts the archive. + :param fn_unpack: Function to extract archives of this type. :type fn_unpack: Callable - :param fn_writer: The writer creation function that takes an archive filename and creates an archive writer object. + :param fn_writer: Function to create an archive writer instance. :type fn_writer: Callable[[str], ArchiveWriter] - :raises ValueError: If no file extensions are provided for the archive type. + :raises ValueError: If no file extensions are provided. Example: - >>> def custom_pack(directory, archive_file, **kwargs): - ... # Custom packing logic here - ... pass - >>> def custom_unpack(archive_file, directory, **kwargs): - ... # Custom unpacking logic here - ... 
pass - >>> register_archive_type('custom', ['.cst'], custom_pack, custom_unpack) + >>> def my_pack(directory, archive_file, **kwargs): pass + >>> def my_unpack(archive_file, directory, **kwargs): pass + >>> def my_writer(archive_file): return CustomWriter(archive_file) + >>> register_archive_type('custom', ['.cst'], my_pack, my_unpack, my_writer) """ if len(exts) == 0: raise ValueError(f'At least one extension name for archive type {name!r} should be provided.') @@ -146,18 +151,17 @@ def register_archive_type(name: str, exts: List[str], fn_pack: Callable, fn_unpa def get_archive_extname(type_name: str) -> str: """ - Get the file extension associated with a registered archive type. - - This function returns the first (primary) file extension associated with the given archive type. + Retrieve the primary file extension for a registered archive type. - :param type_name: The name of the archive type. + :param type_name: Name of the archive type. :type type_name: str - :return: The file extension associated with the archive type. + :return: Primary file extension for the archive type. :rtype: str :raises ValueError: If the archive type is not registered. Example: - >>> get_archive_extname('zip') + >>> ext = get_archive_extname('zip') + >>> print(ext) '.zip' """ if type_name in _KNOWN_ARCHIVE_TYPES: @@ -170,26 +174,24 @@ def get_archive_extname(type_name: str) -> str: def archive_pack(type_name: str, directory: str, archive_file: str, pattern: Optional[str] = None, silent: bool = False, clear: bool = False): """ - Pack a directory into an archive file using the specified archive type. + Create an archive from a directory using the specified archive type. - This function creates an archive of the specified type containing the contents of the given directory. - - :param type_name: The name of the archive type. + :param type_name: Name of the archive type to use. :type type_name: str - :param directory: The directory to pack. + :param directory: Source directory to archive. :type directory: str - :param archive_file: The filename of the resulting archive. + :param archive_file: Output archive file path. :type archive_file: str - :param pattern: A pattern to filter files for inclusion in the archive (optional). + :param pattern: Optional file pattern for filtering (e.g., '*.txt'). :type pattern: str, optional - :param silent: If True, suppress warnings during the packing process. + :param silent: Whether to suppress warnings. :type silent: bool - :param clear: If True, remove existing files when packing. + :param clear: Whether to remove existing files when packing. :type clear: bool :raises ValueError: If the archive type is not registered. Example: - >>> archive_pack('zip', '/path/to/directory', '/path/to/archive.zip', pattern='*.txt') + >>> archive_pack('zip', '/data', 'backup.zip', pattern='*.dat', silent=True) """ exts, fn_pack, _, _ = _KNOWN_ARCHIVE_TYPES[type_name] if not any(os.path.normcase(archive_file).endswith(extname) for extname in exts): @@ -202,19 +204,17 @@ def archive_pack(type_name: str, directory: str, archive_file: str, def get_archive_type(archive_file: str) -> str: """ - Determine the archive type based on the file extension. - - This function examines the file extension of the given archive file and returns the - corresponding archive type name. + Determine the archive type from a file's extension. - :param archive_file: The filename of the archive. + :param archive_file: Path to the archive file. :type archive_file: str - :return: The name of the archive type. 
+ :return: Name of the detected archive type. :rtype: str - :raises ValueError: If the file extension is not associated with any registered archive type. + :raises ValueError: If the file extension doesn't match any registered type. Example: - >>> get_archive_type('/path/to/archive.tar.gz') + >>> type_name = get_archive_type('data.tar.gz') + >>> print(type_name) 'gztar' """ archive_file = os.path.normcase(archive_file) @@ -227,22 +227,20 @@ def get_archive_type(archive_file: str) -> str: def archive_unpack(archive_file: str, directory: str, silent: bool = False, password: Optional[str] = None): """ - Unpack an archive file into a directory using the specified archive type. + Extract an archive file to a directory. - This function extracts the contents of the given archive file into the specified directory. - - :param archive_file: The filename of the archive. + :param archive_file: Path to the archive file to extract. :type archive_file: str - :param directory: The directory to unpack the contents into. + :param directory: Destination directory for extraction. :type directory: str - :param silent: If True, suppress warnings during the unpacking process. + :param silent: Whether to suppress warnings. :type silent: bool - :param password: The password to extract the archive file (optional). + :param password: Optional password for protected archives. :type password: str, optional :raises ValueError: If the archive type is not recognized. Example: - >>> archive_unpack('/path/to/archive.zip', '/path/to/extract') + >>> archive_unpack('protected.zip', 'output_dir', password='secret') """ type_name = get_archive_type(archive_file) _, _, fn_unpack, _ = _KNOWN_ARCHIVE_TYPES[type_name] @@ -251,22 +249,19 @@ def archive_unpack(archive_file: str, directory: str, silent: bool = False, pass def archive_writer(type_name: str, archive_file: str) -> ArchiveWriter: """ - Create an ArchiveWriter instance for the specified archive type. - - This function returns an ArchiveWriter that can be used to add files to an archive. + Create an archive writer for the specified archive type. - :param type_name: The name of the archive type. + :param type_name: Name of the archive type. :type type_name: str - :param archive_file: The filename of the archive to be created or modified. + :param archive_file: Path to the archive file to create. :type archive_file: str - :return: An ArchiveWriter instance for the specified archive type. + :return: An archive writer instance. :rtype: ArchiveWriter :raises ValueError: If the archive type is not registered. Example: - >>> writer = archive_writer('zip', '/path/to/archive.zip') - >>> with writer as w: - ... w.add('/path/to/file.txt', 'file.txt') + >>> with archive_writer('zip', 'output.zip') as writer: + ... writer.add('file.txt', 'docs/file.txt') """ exts, _, _, fn_writer = _KNOWN_ARCHIVE_TYPES[type_name] if not any(os.path.normcase(archive_file).endswith(extname) for extname in exts): @@ -279,6 +274,12 @@ def archive_writer(type_name: str, archive_file: str) -> ArchiveWriter: @lru_cache() def _get_all_extensions(): + """ + Get a list of all registered archive extensions. + + :return: List of all registered file extensions. + :rtype: list + """ extensions = [] for type_name, (exts, _, _, _) in _KNOWN_ARCHIVE_TYPES.items(): extensions.extend(exts) @@ -286,4 +287,17 @@ def _get_all_extensions(): def archive_splitext(filename: str) -> Tuple[str, str]: + """ + Split a filename into root and extension, handling compound extensions. + + :param filename: The filename to split. 
+ :type filename: str + :return: Tuple of (root, extension). + :rtype: Tuple[str, str] + + Example: + >>> root, ext = archive_splitext('data.tar.gz') + >>> print(root, ext) + 'data' '.tar.gz' + """ return splitext_with_composite(filename, _get_all_extensions()) diff --git a/hfutils/entry/upload.py b/hfutils/entry/upload.py index fccf5d554b4..600f76dffb5 100644 --- a/hfutils/entry/upload.py +++ b/hfutils/entry/upload.py @@ -7,7 +7,8 @@ Usage: This module is typically used as part of a larger CLI application for interacting - with HuggingFace repositories. + with HuggingFace repositories. It requires appropriate authentication through + HuggingFace tokens (set via environment variable HF_TOKEN). """ import warnings @@ -27,7 +28,8 @@ class NoRemotePathAssignedWithUpload(ClickErrorException): Custom exception class for indicating that no remote path in the repository is assigned. This exception is raised when attempting to upload without specifying a remote path - (file, archive, or directory) in the repository. + (file, archive, or directory) in the repository. At least one of these options must + be provided for successful upload operation. :attribute exit_code: The exit code to be used when this exception is raised. :type exit_code: int @@ -37,10 +39,11 @@ class NoRemotePathAssignedWithUpload(ClickErrorException): def _add_upload_subcommand(cli: click.Group) -> click.Group: """ - Add the 'upload' subcommand to the CLI. + Add the 'upload' subcommand to the CLI application. - This function defines and adds the 'upload' command to the provided CLI group. - It sets up all the necessary options and implements the upload functionality. + This function enhances the provided CLI group by adding a comprehensive upload command + that supports various upload scenarios to HuggingFace repositories. It configures + multiple options for fine-grained control over the upload process. :param cli: The Click CLI application to which the upload command will be added. :type cli: click.Group @@ -84,40 +87,41 @@ def upload(repo_id: str, repo_type: RepoTypeTyping, input_path: str, revision: str, clear: bool, private: bool, public: bool, wildcard: Optional[str], message: Optional[str], max_size_per_pack: Optional[str]): """ - Upload data to HuggingFace repositories. + Upload data to HuggingFace repositories with various options and modes. - This function handles the upload process to HuggingFace repositories. It supports - uploading individual files, archives, or entire directories. The function also - manages repository creation and visibility settings. + This function implements the core upload functionality, supporting multiple upload modes + and repository management features. It handles repository creation, visibility settings, + and different types of uploads (file, archive, directory). - :param repo_id: Repository to upload to. + :param repo_id: Repository identifier to upload to. :type repo_id: str - :param repo_type: Type of the HuggingFace repository. + :param repo_type: Type of the HuggingFace repository (e.g., dataset, model). :type repo_type: RepoTypeTyping - :param file_in_repo: File in repository to upload. + :param file_in_repo: Target path for single file upload in the repository. :type file_in_repo: Optional[str] - :param archive_in_repo: Archive file in repository to upload and extract from. + :param archive_in_repo: Target path for archive upload in the repository. :type archive_in_repo: Optional[str] - :param dir_in_repo: Directory in repository to upload the full directory tree. 
+ :param dir_in_repo: Target directory path in the repository for directory upload. :type dir_in_repo: Optional[str] - :param input_path: Input path for upload. + :param input_path: Local path of the file or directory to upload. :type input_path: str - :param revision: Revision of repository. + :param revision: Repository revision/branch to upload to. :type revision: str - :param clear: Clear the remote directory before uploading. - Only applied when -d is used. + :param clear: Whether to clear existing content before directory upload. :type clear: bool - :param private: Set private repository when created. + :param private: Flag to set repository as private when created. :type private: bool - :param public: Set public repository when created. + :param public: Flag to set repository as public when created. :type public: bool - :param wildcard: Wildcard pattern for selecting files to upload. + :param wildcard: Pattern for filtering files during upload. :type wildcard: Optional[str] - :param message: Commit message for this operation. + :param message: Commit message for the upload operation. :type message: Optional[str] + :param max_size_per_pack: Maximum size limit for archive packages. + :type max_size_per_pack: Optional[str] - :raises NoRemotePathAssignedWithUpload: If no remote path in repository is assigned. - :raises ValueError: If both private and public flags are set. + :raises NoRemotePathAssignedWithUpload: If no upload mode is specified. + :raises ValueError: If conflicting visibility settings are provided. """ configure_http_backend(get_requests_session) From 337acac5f5fb71157029b2e26391206206631983 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Wed, 27 Nov 2024 21:07:23 +0800 Subject: [PATCH 5/5] dev(narugo): complete this part --- hfutils/operate/upload.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/hfutils/operate/upload.py b/hfutils/operate/upload.py index bc64a0b3540..3d258725aab 100644 --- a/hfutils/operate/upload.py +++ b/hfutils/operate/upload.py @@ -135,7 +135,8 @@ def upload_directory_as_archive(local_directory, repo_id: str, archive_in_repo: else: id_pattern = f'{{x:0{max(len(str(len(file_groups))), 5)}d}}' raw_dst_archive_file = os.path.normpath(os.path.join(td, archive_in_repo)) - for gid, group in enumerate(file_groups, start=1): + for gid, group in enumerate(tqdm(file_groups, silent=silent, + desc=f'Making {plural_word(len(file_groups), "package")}'), start=1): group: FilesGroup dst_archive_file_body, dst_archive_file_ext = archive_splitext(raw_dst_archive_file) dst_archive_file = (f'{dst_archive_file_body}'
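Usage sketch for the multi-pack upload added in this series (illustrative only; the local directory and repository ID below are hypothetical placeholders, not values taken from the patches):

    from hfutils.operate.upload import upload_directory_as_archive

    # Pack './my_dataset' into archives of at most ~2GB each and upload them.
    # 'max_size_per_pack' is forwarded to walk_files_with_groups() as
    # 'max_total_size', so size strings understood by size_to_bytes()
    # (e.g. '500MB', '2GB') as well as plain byte counts should both work.
    upload_directory_as_archive(
        local_directory='./my_dataset',          # hypothetical local directory
        repo_id='your-name/your-dataset',        # hypothetical HuggingFace repository
        archive_in_repo='packs/images.tar.gz',   # target archive path inside the repo
        repo_type='dataset',
        max_size_per_pack='2GB',
    )

When the files do not fit into a single pack, the group index and total are injected in front of the composite archive extension, so the call above would upload files such as packs/images-00001-of-00003.tar.gz rather than a single packs/images.tar.gz; if the grouping yields fewer than two packs, or max_size_per_pack is left as None, the original single-archive behaviour is kept.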
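The new extension helpers behave like os.path.splitext() except that known multi-part extensions are kept in one piece; the values below mirror the doctests and unit tests added in this series:

    from hfutils.archive import archive_splitext
    from hfutils.utils import splitext_with_composite

    # Composite extensions are matched case-insensitively, in the order given.
    print(splitext_with_composite('data.backup.tar.gz', ['.tar.gz']))  # ('data.backup', '.tar.gz')
    print(splitext_with_composite('file.txt', ['.tar.gz']))            # ('file', '.txt'), plain splitext fallback

    # archive_splitext() checks every extension registered via register_archive_type(),
    # which is how the multi-pack upload keeps '.tar.gz' together when numbering packs.
    print(archive_splitext('images.tar.gz'))                           # ('images', '.tar.gz')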