Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

dev(narugo): add repository analysis utils #56

Merged
merged 4 commits into from
Dec 11, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
os:
- 'ubuntu-20.04'
- 'windows-2019'
- 'macos-12'
- 'macos-13'
python-version:
- '3.8.0'

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/release_test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
os:
- 'ubuntu-20.04'
- 'windows-2019'
- 'macos-12'
- 'macos-13'
python-version:
- '3.8.0'

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ jobs:
os:
- 'ubuntu-20.04'
- 'windows-2019'
- 'macos-12'
- 'macos-13'
python-version:
- '3.8.0'

Expand Down
1 change: 1 addition & 0 deletions docs/source/api_doc/repository/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,6 @@ hfutils.repository
base
clone
rollback
size
url

32 changes: 32 additions & 0 deletions docs/source/api_doc/repository/size.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
hfutils.repository.size
=============================================

.. currentmodule:: hfutils.repository.size

.. automodule:: hfutils.repository.size



RepoFileItem
-------------------------------------------

.. autoclass:: RepoFileItem
:members: from_repo_file,path_segments,__repr__



RepoFileList
-------------------------------------------

.. autoclass:: RepoFileList
:members: __init__,__getitem__,__len__,__repr__,total_size,repr



hf_hub_repo_analysis
---------------------------------

.. autofunction:: hf_hub_repo_analysis



1 change: 1 addition & 0 deletions hfutils/repository/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,4 +5,5 @@
from .base import hf_hub_repo_url
from .clone import hf_hub_clone
from .rollback import hf_hub_rollback
from .size import RepoFileItem, RepoFileList, hf_hub_repo_analysis
from .url import hf_hub_repo_path_url, hf_hub_repo_file_url
254 changes: 254 additions & 0 deletions hfutils/repository/size.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,254 @@
"""
This module provides functionality for analyzing and managing Hugging Face repository files.

It includes classes and functions for representing repository files, creating file lists,
and analyzing repository contents. The module is designed to work with the Hugging Face
Hub API and provides utilities for handling file paths, sizes, and other metadata.

Key components:
- RepoFileItem: Represents a single file in a repository.
- RepoFileList: A sequence of RepoFileItems with additional metadata and utility methods.
- hf_hub_repo_analysis: Function to analyze repository contents based on given criteria.

This module is particularly useful for developers working with Hugging Face repositories
and need to analyze or manage file structures and metadata.
"""

import os.path
from collections.abc import Sequence
from dataclasses import dataclass
from operator import itemgetter
from typing import Optional, List, Literal, Tuple

from hbutils.scale import size_to_bytes_str
from hbutils.string import plural_word, format_tree
from huggingface_hub.hf_api import RepoFile

from ..operate import list_all_with_pattern
from ..operate.base import RepoTypeTyping
from ..utils import hf_normpath, hf_fs_path


@dataclass
class RepoFileItem:
"""
Represents a file item in a Hugging Face repository.

This class encapsulates metadata about a single file, including its path,
size, LFS status, and blob ID.

:param path: The file path relative to the repository root.
:param size: The size of the file in bytes.
:param is_lfs: Whether the file is stored using Git LFS.
:param lfs_sha256: The SHA256 hash of the LFS file, if applicable.
:param blob_id: The Git blob ID of the file.
"""

path: str
size: int
is_lfs: bool
lfs_sha256: Optional[str]
blob_id: str

@classmethod
def from_repo_file(cls, repo_file: RepoFile, subdir: str = '') -> 'RepoFileItem':
"""
Create a RepoFileItem from a RepoFile object.

:param repo_file: The RepoFile object to convert.
:param subdir: The subdirectory to use as the base path (default: '').
:return: A new RepoFileItem instance.
"""
subdir = subdir or '.'
return cls(
path=hf_normpath(os.path.relpath(repo_file.path, subdir)),
size=repo_file.lfs.size if repo_file.lfs else repo_file.size,
is_lfs=bool(repo_file.lfs),
lfs_sha256=repo_file.lfs.sha256 if repo_file.lfs else None,
blob_id=repo_file.blob_id,
)

@property
def path_segments(self) -> Tuple[str, ...]:
"""
Get the path segments of the file.

:return: A tuple of path segments.
"""
return tuple(seg for seg in self.path.split('/') if seg and seg != '.')

def __repr__(self):
"""
Return a string representation of the RepoFileItem.

:return: A formatted string representation.
"""
return (f'<{self.__class__.__name__} {self.path}, size: {size_to_bytes_str(self.size, sigfigs=3, system="si")}'
f'{" (LFS)" if self.is_lfs else ""}>')


class RepoFileList(Sequence):
"""
Represents a list of RepoFileItems with additional metadata and utility methods.

This class provides a way to manage and analyze a collection of files from a
Hugging Face repository, including information about the repository itself.

:param repo_id: The ID of the repository.
:param items: A list of RepoFileItem objects.
:param repo_type: The type of the repository (default: 'dataset').
:param revision: The revision of the repository (default: 'main').
:param subdir: The subdirectory within the repository (default: '').
"""

def __init__(self, repo_id: str, items: List[RepoFileItem],
repo_type: RepoTypeTyping = 'dataset', revision: str = 'main', subdir: Optional[str] = ''):
self.repo_id = repo_id
self.repo_type = repo_type
self.revision = revision
self.subdir = hf_normpath(subdir or '.')
self._file_items = list(items)
self._total_size = 0
for item in self._file_items:
self._total_size += item.size

def __getitem__(self, index):
"""
Get a RepoFileItem by index.

:param index: The index of the item to retrieve.
:return: The RepoFileItem at the specified index.
"""
return self._file_items[index]

def __len__(self) -> int:
"""
Get the number of items in the list.

:return: The number of RepoFileItems in the list.
"""
return len(self._file_items)

@property
def total_size(self) -> int:
"""
Get the total size of all files in the list.

:return: The total size in bytes.
"""
return self._total_size

def _tree(self, max_items: Optional[int] = 10):
"""
Generate a tree representation of the file list.

:param max_items: The maximum number of items to include in the tree (default: 10).
:return: A tuple containing the tree title and list of tree nodes.
"""
title = hf_fs_path(
repo_id=self.repo_id,
repo_type=self.repo_type,
revision=self.revision,
filename=self.subdir,
)
return (
f'{title} ({plural_word(len(self._file_items), "file")}, '
f'{size_to_bytes_str(self._total_size, sigfigs=3, system="si")})',
[
*(
(repr(item), []) for item in
(self._file_items[:max_items] if max_items and max_items > 0 else self._file_items)
),
*(
[
(f'... ({plural_word(len(self._file_items), "file")}) in total ...', [])
] if max_items and 0 < max_items < len(self._file_items) else []
)
]
)

def __repr__(self):
"""
Return a string representation of the RepoFileList.

:return: A formatted string representation of the file list.
"""
return format_tree(
self._tree(),
format_node=itemgetter(0),
get_children=itemgetter(1),
)

def repr(self, max_items: Optional[int] = 10):
"""
Generate a custom string representation of the RepoFileList.

:param max_items: The maximum number of items to include in the representation (default: 10).
:return: A formatted string representation of the file list.
"""
return format_tree(
self._tree(max_items=max_items),
format_node=itemgetter(0),
get_children=itemgetter(1),
)


SortByTyping = Literal['none', 'path', 'size']


def hf_hub_repo_analysis(
repo_id: str, pattern: str = '**/*', repo_type: RepoTypeTyping = 'dataset',
revision: str = 'main', hf_token: Optional[str] = None, silent: bool = False,
subdir: str = '', sort_by: SortByTyping = 'path', **kwargs,
) -> RepoFileList:
"""
Analyze the contents of a Hugging Face repository.

This function retrieves file information from a specified repository and creates
a RepoFileList object containing detailed information about each file.

:param repo_id: The ID of the repository to analyze.
:param pattern: A glob pattern to filter files (default: '**/*').
:param repo_type: The type of the repository (default: 'dataset').
:param revision: The revision of the repository to analyze (default: 'main').
:param hf_token: The Hugging Face API token (optional).
:param silent: Whether to suppress output (default: False).
:param subdir: The subdirectory within the repository to analyze (default: '').
:param sort_by: How to sort the file list ('none', 'path', or 'size') (default: 'path').
:param kwargs: Additional keyword arguments to pass to list_all_with_pattern.

:return: A RepoFileList object containing the analysis results.

:raises: May raise exceptions related to API access or file operations.

Usage:
>>> result = hf_hub_repo_analysis('username/repo', pattern='*.txt', repo_type='model')
>>> print(result)
"""
if subdir and subdir != '.':
pattern = f'{subdir}/{pattern}'

file_items = []
for item in list_all_with_pattern(
repo_id=repo_id,
repo_type=repo_type,
revision=revision,
pattern=pattern,
hf_token=hf_token,
silent=silent,
**kwargs
):
if isinstance(item, RepoFile):
file_items.append(RepoFileItem.from_repo_file(item, subdir))

if sort_by != 'none':
file_items = sorted(file_items,
key=lambda x: x.path_segments if sort_by == 'path' else (-x.size, x.path_segments))

return RepoFileList(
repo_id=repo_id,
repo_type=repo_type,
revision=revision,
subdir=subdir,
items=file_items
)
Loading
Loading