From d2f0ba704ace8ead3684e57fa666010d98e9d6c8 Mon Sep 17 00:00:00 2001 From: narugo1992 Date: Fri, 2 Aug 2024 06:04:20 +0800 Subject: [PATCH] dev(narugo): add docs --- docs/source/api_doc/index/index.rst | 1 + docs/source/api_doc/index/local_fetch.rst | 51 ++++++++ hfutils/index/local_fetch.py | 146 ++++++++++++++++++++++ 3 files changed, 198 insertions(+) create mode 100644 docs/source/api_doc/index/local_fetch.rst diff --git a/docs/source/api_doc/index/index.rst b/docs/source/api_doc/index/index.rst index b1fb93080b..bb273e1394 100644 --- a/docs/source/api_doc/index/index.rst +++ b/docs/source/api_doc/index/index.rst @@ -10,6 +10,7 @@ hfutils.index :maxdepth: 3 fetch + local_fetch make validate diff --git a/docs/source/api_doc/index/local_fetch.rst b/docs/source/api_doc/index/local_fetch.rst new file mode 100644 index 0000000000..208dd71a52 --- /dev/null +++ b/docs/source/api_doc/index/local_fetch.rst @@ -0,0 +1,51 @@ +hfutils.index.local_fetch +================================ + +.. currentmodule:: hfutils.index.local_fetch + +.. automodule:: hfutils.index.local_fetch + + + +tar_get_index +---------------------------------------------- + +.. autofunction:: tar_get_index + + + +tar_list_files +---------------------------------------------- + +.. autofunction:: tar_list_files + + + +tar_file_exists +---------------------------------------------- + +.. autofunction:: tar_file_exists + + + +tar_file_size +---------------------------------------------- + +.. autofunction:: tar_file_size + + + +tar_file_info +---------------------------------------------- + +.. autofunction:: tar_file_info + + + +tar_file_download +---------------------------------------------- + +.. 
autofunction:: tar_file_download + + + diff --git a/hfutils/index/local_fetch.py b/hfutils/index/local_fetch.py index 31fb7c239f..4d86fbb5b2 100644 --- a/hfutils/index/local_fetch.py +++ b/hfutils/index/local_fetch.py @@ -1,9 +1,42 @@ +""" +This module provides utility functions for working with tar archives and their associated index files. +It includes functions for retrieving archive indexes, listing files, checking file existence, +getting file information, and downloading files from archives. + +The module relies on a JSON-based index file that contains metadata about the files within the archive, +including their offsets, sizes, and optional SHA256 hashes. + +Functions in this module are designed to work with both local archive files and their corresponding +index files, providing a convenient interface for archive manipulation and file extraction. +""" + import json import os from typing import Optional, List def tar_get_index(archive_file: str, idx_file: Optional[str] = None): + """ + Retrieve the index data for a given tar archive file. + + This function reads the JSON index file associated with the archive, + which contains metadata about the files within the archive. + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param idx_file: Optional path to the index file. If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + + :return: The parsed JSON data from the index file. + :rtype: dict + + :raises FileNotFoundError: If the index file is not found. + :raises json.JSONDecodeError: If the index file is not valid JSON. 
+ + :example: + >>> index_data = tar_get_index('my_archive.tar') + """ body, _ = os.path.splitext(archive_file) default_index_file = f'{body}.json' with open(idx_file or default_index_file, 'r') as f: @@ -11,6 +44,26 @@ def tar_get_index(archive_file: str, idx_file: Optional[str] = None): def tar_list_files(archive_file: str, idx_file: Optional[str] = None) -> List[str]: + """ + List all files contained within the specified tar archive. + + This function uses the archive's index file to retrieve the list of files + without actually reading the tar archive itself. + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param idx_file: Optional path to the index file. If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + + :return: A list of file names contained in the archive. + :rtype: List[str] + + :example: + >>> files = tar_list_files('my_archive.tar') + >>> for file in files: + ...     print(file) + """ index_data = tar_get_index( archive_file=archive_file, idx_file=idx_file, @@ -19,6 +72,28 @@ def tar_list_files(archive_file: str, idx_file: Optional[str] = None) -> List[st def tar_file_exists(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> bool: + """ + Check if a specific file exists within the tar archive. + + This function uses the archive's index to check for file existence + without reading the entire archive. + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param file_in_archive: The name of the file to check for in the archive. + :type file_in_archive: str + :param idx_file: Optional path to the index file. If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + + :return: True if the file exists in the archive, False otherwise. 
+ :rtype: bool + + :example: + >>> exists = tar_file_exists('my_archive.tar', 'path/to/file.txt') + >>> if exists: + ...     print("File exists in the archive") + """ from .fetch import _hf_files_process, _n_path index = tar_get_index( archive_file=archive_file, @@ -29,6 +104,29 @@ def tar_file_exists(archive_file: str, file_in_archive: str, idx_file: Optional[ def tar_file_info(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> dict: + """ + Retrieve information about a specific file within the tar archive. + + This function returns a dictionary containing metadata about the specified file, + such as its size and offset within the archive. + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param file_in_archive: The name of the file to get information for. + :type file_in_archive: str + :param idx_file: Optional path to the index file. If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + + :return: A dictionary containing file metadata. + :rtype: dict + + :raises FileNotFoundError: If the specified file is not found in the archive. + + :example: + >>> info = tar_file_info('my_archive.tar', 'path/to/file.txt') + >>> print(f"File size: {info['size']} bytes") + """ from .fetch import _hf_files_process, _n_path index = tar_get_index( archive_file=archive_file, @@ -43,6 +141,28 @@ def tar_file_info(archive_file: str, file_in_archive: str, idx_file: Optional[st def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[str] = None) -> int: + """ + Get the size of a specific file within the tar archive. + + This function returns the size of the specified file in bytes. + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param file_in_archive: The name of the file to get the size for. + :type file_in_archive: str + :param idx_file: Optional path to the index file. 
If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + + :return: The size of the file in bytes. + :rtype: int + + :raises FileNotFoundError: If the specified file is not found in the archive. + + :example: + >>> size = tar_file_size('my_archive.tar', 'path/to/file.txt') + >>> print(f"File size: {size} bytes") + """ return tar_file_info( archive_file=archive_file, file_in_archive=file_in_archive, @@ -52,6 +172,32 @@ def tar_file_size(archive_file: str, file_in_archive: str, idx_file: Optional[st def tar_file_download(archive_file: str, file_in_archive: str, local_file: str, idx_file: Optional[str] = None, chunk_size: int = 1 << 20): + """ + Extract and download a specific file from the tar archive to a local file. + + This function reads the specified file from the archive and writes it to a local file. + It also performs integrity checks to ensure the downloaded file is complete and matches + the expected hash (if provided in the index). + + :param archive_file: Path to the tar archive file. + :type archive_file: str + :param file_in_archive: The name of the file to extract from the archive. + :type file_in_archive: str + :param local_file: The path where the extracted file should be saved. + :type local_file: str + :param idx_file: Optional path to the index file. If not provided, + it will be inferred from the archive file name. + :type idx_file: Optional[str] + :param chunk_size: The size of chunks to read and write, in bytes. Default is 1MB. + :type chunk_size: int + + :raises FileNotFoundError: If the specified file is not found in the archive. + :raises ArchiveStandaloneFileIncompleteDownload: If the downloaded file size doesn't match the expected size. + :raises ArchiveStandaloneFileHashNotMatch: If the SHA256 hash of the downloaded file doesn't match the expected hash. 
+ + :example: + >>> tar_file_download('my_archive.tar', 'path/to/file.txt', 'local_file.txt') + """ from .fetch import _hf_files_process, _n_path, _f_sha256, \ ArchiveStandaloneFileIncompleteDownload, ArchiveStandaloneFileHashNotMatch