diff --git a/docs/source/api_doc/utils/data.rst b/docs/source/api_doc/utils/data.rst new file mode 100644 index 0000000000..e1a70e98cc --- /dev/null +++ b/docs/source/api_doc/utils/data.rst @@ -0,0 +1,15 @@ +hfutils.utils.data +================================= + +.. currentmodule:: hfutils.utils.data + +.. automodule:: hfutils.utils.data + + +is_data_file +--------------------------- + +.. autofunction:: is_data_file + + + diff --git a/docs/source/api_doc/utils/index.rst b/docs/source/api_doc/utils/index.rst index cb0aef8cc5..088c55876a 100644 --- a/docs/source/api_doc/utils/index.rst +++ b/docs/source/api_doc/utils/index.rst @@ -11,7 +11,9 @@ hfutils.utils archive binary + data download + model number path session diff --git a/docs/source/api_doc/utils/model.rst b/docs/source/api_doc/utils/model.rst new file mode 100644 index 0000000000..df369a25a1 --- /dev/null +++ b/docs/source/api_doc/utils/model.rst @@ -0,0 +1,15 @@ +hfutils.utils.model +================================= + +.. currentmodule:: hfutils.utils.model + +.. automodule:: hfutils.utils.model + + +is_model_file +--------------------------- + +.. autofunction:: is_model_file + + + diff --git a/hfutils/utils/data.py b/hfutils/utils/data.py index cf36bd9e79..ec7e57339b 100644 --- a/hfutils/utils/data.py +++ b/hfutils/utils/data.py @@ -1,3 +1,11 @@ +""" +This module provides functionality for identifying data files based on their file extensions. + +It includes a comprehensive set of data file extensions and a function to check if a given +filename corresponds to a known data file format. This can be useful in various data processing +and file handling scenarios where it's necessary to distinguish data files from other types of files. +""" + import os from typing import Union @@ -44,6 +52,32 @@ def is_data_file(filename: Union[str, os.PathLike]) -> bool: + """ + Determine if a given filename corresponds to a known data file format. 
+ + This function checks if the file extension of the provided filename matches + any of the known data file extensions defined in the `_DATA_EXTS` set. + + :param filename: The name of the file to check. Can be a string or a path-like object. + :type filename: Union[str, os.PathLike] + + :return: True if the file extension matches a known data file format, False otherwise. + :rtype: bool + + :raises TypeError: If the provided filename is not a string or path-like object. + + Usage: + >>> is_data_file('data.csv') + True + >>> is_data_file('script.py') + False + >>> is_data_file(Path('/path/to/data.json')) + True + + .. note:: + The function is case-insensitive and works with both file names and full paths. + It normalizes the filename and extracts only the extension for comparison. + """ if not isinstance(filename, (str, os.PathLike)): raise TypeError(f'Unknown file name type - {filename!r}') diff --git a/hfutils/utils/model.py b/hfutils/utils/model.py index ca28bf0cd2..5d60312b20 100644 --- a/hfutils/utils/model.py +++ b/hfutils/utils/model.py @@ -1,3 +1,28 @@ +""" +This module provides functionality for identifying model files based on their extensions and naming patterns. + +It includes a comprehensive list of model file extensions, patterns for sharded model files, and specific patterns +for Hugging Face model files. The main function, :func:`is_model_file`, determines whether a given filename corresponds +to a model file based on these predefined patterns and extensions. + +This module can be useful in various scenarios, such as: + +- Automated model file detection in directories +- Validation of uploaded files in machine learning platforms +- Preprocessing steps in model loading pipelines + +Usage: + .. 
code:: python + + from hfutils.utils.model import is_model_file + + filename = "model.pt" + if is_model_file(filename): + print(f"{filename} is a model file") + else: + print(f"{filename} is not a model file") +""" + import os import re from typing import Union @@ -74,6 +99,33 @@ def is_model_file(filename: Union[str, os.PathLike]) -> bool: + """ + Determine if a given filename corresponds to a model file. + + This function checks if the provided filename matches any of the known model file extensions + or patterns, including sharded model files and Hugging Face specific patterns. + + :param filename: The name of the file to check. Can be a full path or just the filename. + :type filename: Union[str, os.PathLike] + + :return: True if the filename corresponds to a model file, False otherwise. + :rtype: bool + + :raises TypeError: If the filename is not a string or os.PathLike object. + + Usage: + >>> is_model_file("model.pt") + True + >>> is_model_file("data.csv") + False + >>> is_model_file("model-00001-of-00005") + True + >>> is_model_file("pytorch_model.bin") + True + + .. note:: + This function is case-insensitive and works with both file names and full paths. + """ if not isinstance(filename, (str, os.PathLike)): raise TypeError(f'Unknown file name type - {filename!r}') filename = os.path.basename(os.path.normcase(str(filename)))