Skip to content

Commit

Permalink
Reformat API doc and generate docs automatically (#299)
Browse files Browse the repository at this point in the history
  • Loading branch information
pan-x-c authored Apr 18, 2024
1 parent 1d94a67 commit 1647e3d
Show file tree
Hide file tree
Showing 41 changed files with 329 additions and 1,210 deletions.
Original file line number Diff line number Diff line change
@@ -1,9 +1,13 @@
name: Deploy Sphinx documentation to Pages

on:
release:
types: [published]
workflow_dispatch:
pull_request:
types: [opened, synchronize]
paths:
- 'docs/sphinx_doc/**/*'
push:
branches:
- main

jobs:
pages:
Expand All @@ -19,14 +23,18 @@ jobs:
run: |
python -m pip install --upgrade pip
pip install -v -e .[dev]
- id: deployment
uses: sphinx-notes/pages@v3
- id: build
name: Build Documentation
run: |
cd docs/sphinx_doc
bash build_doc.sh
- name: Upload Documentation
uses: actions/upload-artifact@v3
with:
documentation_path: ./docs/sphinx_doc/source
python_version: ${{ matrix.python-version }}
publish: false
requirements_path: ./environments/dev_requires.txt
name: SphinxDoc
path: 'docs/sphinx_doc/build/html'
- uses: peaceiris/actions-gh-pages@v3
if: ${{ github.event_name == 'push' && github.ref == 'refs/heads/main' }}
with:
github_token: ${{ secrets.GITHUB_TOKEN }}
publish_dir: ${{ steps.deployment.outputs.artifact }}
publish_dir: 'docs/sphinx_doc/build/html'
7 changes: 7 additions & 0 deletions data_juicer/analysis/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,9 @@
from .column_wise_analysis import ColumnWiseAnalysis
from .diversity_analysis import DiversityAnalysis
from .overall_analysis import OverallAnalysis

__all__ = [
'ColumnWiseAnalysis',
'DiversityAnalysis',
'OverallAnalysis',
]
8 changes: 7 additions & 1 deletion data_juicer/config/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,7 @@
from .config import * # noqa: F401,F403
from .config import export_config, init_configs, merge_config

__all__ = [
'init_configs',
'export_config',
'merge_config',
]
8 changes: 8 additions & 0 deletions data_juicer/core/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,11 @@
from .executor import Executor
from .exporter import Exporter
from .tracer import Tracer

__all__ = [
'Analyser',
'NestedDataset',
'Executor',
'Exporter',
'Tracer',
]
13 changes: 13 additions & 0 deletions data_juicer/format/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,16 @@
from . import (csv_formatter, json_formatter, mixture_formatter,
parquet_formatter, text_formatter, tsv_formatter)
from .csv_formatter import CsvFormatter
from .formatter import LocalFormatter, RemoteFormatter
from .json_formatter import JsonFormatter
from .load import load_formatter
from .mixture_formatter import MixtureFormatter
from .parquet_formatter import ParquetFormatter
from .text_formatter import TextFormatter
from .tsv_formatter import TsvFormatter

__all__ = [
'load_formatter', 'JsonFormatter', 'LocalFormatter', 'RemoteFormatter',
'TextFormatter', 'ParquetFormatter', 'CsvFormatter', 'TsvFormatter',
'MixtureFormatter'
]
8 changes: 8 additions & 0 deletions data_juicer/ops/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
from . import deduplicator, filter, mapper, selector
from .base_op import OPERATORS, Deduplicator, Filter, Mapper, Selector
from .load import load_ops

__all__ = [
'load_ops',
'Filter',
'Mapper',
'Deduplicator',
'Selector',
]
11 changes: 11 additions & 0 deletions data_juicer/ops/common/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,14 @@
split_on_newline_tab_whitespace, split_on_whitespace,
strip, words_augmentation, words_refinement)
from .special_characters import SPECIAL_CHARACTERS

__all__ = [
'get_sentences_from_document',
'get_words_from_document',
'merge_on_whitespace_tab_newline',
'split_on_newline_tab_whitespace',
'split_on_whitespace',
'strip',
'words_augmentation',
'words_refinement',
]
4 changes: 2 additions & 2 deletions data_juicer/ops/common/helper_func.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,8 @@ def get_words_from_document(
:param document: document that needs to be split into words.
:param token_func: function of tokenizer, if specified, the function
will be used for split document into different tokens.
:param new_line: whether to use `\\\\n' to split words.
will be used for split document into different tokens.
:param new_line: whether to use '\\\\n' to split words.
:param tab: whether to use '\\\\t' to split words.
:return: word list obtained from document
"""
Expand Down
15 changes: 15 additions & 0 deletions data_juicer/ops/deduplicator/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,18 @@
document_simhash_deduplicator, image_deduplicator,
ray_document_deduplicator, ray_image_deduplicator,
ray_video_deduplicator, video_deduplicator)
from .document_deduplicator import DocumentDeduplicator
from .document_minhash_deduplicator import DocumentMinhashDeduplicator
from .document_simhash_deduplicator import DocumentSimhashDeduplicator
from .image_deduplicator import ImageDeduplicator
from .ray_basic_deduplicator import RayBasicDeduplicator
from .ray_document_deduplicator import RayDocumentDeduplicator
from .ray_image_deduplicator import RayImageDeduplicator
from .ray_video_deduplicator import RayVideoDeduplicator
from .video_deduplicator import VideoDeduplicator

__all__ = [
'VideoDeduplicator', 'RayBasicDeduplicator', 'DocumentMinhashDeduplicator',
'RayImageDeduplicator', 'RayDocumentDeduplicator', 'DocumentDeduplicator',
'ImageDeduplicator', 'DocumentSimhashDeduplicator', 'RayVideoDeduplicator'
]
2 changes: 1 addition & 1 deletion data_juicer/ops/deduplicator/ray_document_deduplicator.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def __init__(self,
:param redis_port: the port of redis server
:param lowercase: Whether to convert sample text to lower case
:param ignore_non_character: Whether to ignore non-alphabet
characters, including whitespaces, digits, and punctuations
characters, including whitespaces, digits, and punctuations
:param args: extra args
:param kwargs: extra args.
"""
Expand Down
86 changes: 86 additions & 0 deletions data_juicer/ops/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,5 +18,91 @@
video_nsfw_filter, video_ocr_area_ratio_filter,
video_resolution_filter, video_tagging_from_frames_filter,
video_watermark_filter, word_num_filter, word_repetition_filter)
from .alphanumeric_filter import AlphanumericFilter
from .audio_duration_filter import AudioDurationFilter
from .audio_nmf_snr_filter import AudioNMFSNRFilter
from .audio_size_filter import AudioSizeFilter
from .average_line_length_filter import AverageLineLengthFilter
from .character_repetition_filter import CharacterRepetitionFilter
from .flagged_words_filter import FlaggedWordFilter
from .image_aesthetics_filter import ImageAestheticsFilter
from .image_aspect_ratio_filter import ImageAspectRatioFilter
from .image_face_ratio_filter import ImageFaceRatioFilter
from .image_nsfw_filter import ImageNSFWFilter
from .image_shape_filter import ImageShapeFilter
from .image_size_filter import ImageSizeFilter
from .image_text_matching_filter import ImageTextMatchingFilter
from .image_text_similarity_filter import ImageTextSimilarityFilter
from .image_watermark_filter import ImageWatermarkFilter
from .language_id_score_filter import LanguageIDScoreFilter
from .maximum_line_length_filter import MaximumLineLengthFilter
from .perplexity_filter import PerplexityFilter
from .phrase_grounding_recall_filter import PhraseGroundingRecallFilter
from .special_characters_filter import SpecialCharactersFilter
from .specified_field_filter import SpecifiedFieldFilter
from .specified_numeric_field_filter import SpecifiedNumericFieldFilter
from .stopwords_filter import StopWordsFilter
from .suffix_filter import SuffixFilter
from .text_action_filter import TextActionFilter
from .text_entity_dependency_filter import TextEntityDependencyFilter
from .text_length_filter import TextLengthFilter
from .token_num_filter import TokenNumFilter
from .video_aesthetics_filter import VideoAestheticsFilter
from .video_aspect_ratio_filter import VideoAspectRatioFilter
from .video_duration_filter import VideoDurationFilter
from .video_frames_text_similarity_filter import \
VideoFramesTextSimilarityFilter
from .video_motion_score_filter import VideoMotionScoreFilter
from .video_nsfw_filter import VideoNSFWFilter
from .video_ocr_area_ratio_filter import VideoOcrAreaRatioFilter
from .video_resolution_filter import VideoResolutionFilter
from .video_tagging_from_frames_filter import VideoTaggingFromFramesFilter
from .video_watermark_filter import VideoWatermarkFilter
from .word_num_filter import WordNumFilter
from .word_repetition_filter import WordRepetitionFilter

__all__ = [
'ImageTextSimilarityFilter',
'VideoAspectRatioFilter',
'ImageTextMatchingFilter',
'ImageNSFWFilter',
'TokenNumFilter',
'TextLengthFilter',
'SpecifiedNumericFieldFilter',
'AudioNMFSNRFilter',
'VideoAestheticsFilter',
'PerplexityFilter',
'PhraseGroundingRecallFilter',
'MaximumLineLengthFilter',
'AverageLineLengthFilter',
'SpecifiedFieldFilter',
'VideoTaggingFromFramesFilter',
'TextEntityDependencyFilter',
'VideoResolutionFilter',
'AlphanumericFilter',
'ImageWatermarkFilter',
'ImageAestheticsFilter',
'AudioSizeFilter',
'StopWordsFilter',
'CharacterRepetitionFilter',
'ImageShapeFilter',
'VideoDurationFilter',
'TextActionFilter',
'VideoOcrAreaRatioFilter',
'VideoNSFWFilter',
'SpecialCharactersFilter',
'VideoFramesTextSimilarityFilter',
'ImageAspectRatioFilter',
'AudioDurationFilter',
'LanguageIDScoreFilter',
'SuffixFilter',
'ImageSizeFilter',
'VideoWatermarkFilter',
'WordNumFilter',
'ImageFaceRatioFilter',
'FlaggedWordFilter',
'WordRepetitionFilter',
'VideoMotionScoreFilter',
]

# yapf: enable
93 changes: 93 additions & 0 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,98 @@
video_tagging_from_audio_mapper,
video_tagging_from_frames_mapper,
whitespace_normalization_mapper)
from .audio_ffmpeg_wrapped_mapper import AudioFFmpegWrappedMapper
from .chinese_convert_mapper import ChineseConvertMapper
from .clean_copyright_mapper import CleanCopyrightMapper
from .clean_email_mapper import CleanEmailMapper
from .clean_html_mapper import CleanHtmlMapper
from .clean_ip_mapper import CleanIpMapper
from .clean_links_mapper import CleanLinksMapper
from .expand_macro_mapper import ExpandMacroMapper
from .fix_unicode_mapper import FixUnicodeMapper
from .image_blur_mapper import ImageBlurMapper
from .image_captioning_from_gpt4v_mapper import ImageCaptioningFromGPT4VMapper
from .image_captioning_mapper import ImageCaptioningMapper
from .image_diffusion_mapper import ImageDiffusionMapper
from .image_face_blur_mapper import ImageFaceBlurMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
from .punctuation_normalization_mapper import PunctuationNormalizationMapper
from .remove_bibliography_mapper import RemoveBibliographyMapper
from .remove_comments_mapper import RemoveCommentsMapper
from .remove_header_mapper import RemoveHeaderMapper
from .remove_long_words_mapper import RemoveLongWordsMapper
from .remove_non_chinese_character_mapper import \
RemoveNonChineseCharacterlMapper
from .remove_repeat_sentences_mapper import RemoveRepeatSentencesMapper
from .remove_specific_chars_mapper import RemoveSpecificCharsMapper
from .remove_table_text_mapper import RemoveTableTextMapper
from .remove_words_with_incorrect_substrings_mapper import \
RemoveWordsWithIncorrectSubstringsMapper
from .replace_content_mapper import ReplaceContentMapper
from .sentence_split_mapper import SentenceSplitMapper
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import \
VideoCaptioningFromFramesMapper
from .video_captioning_from_summarizer_mapper import \
VideoCaptioningFromSummarizerMapper
from .video_captioning_from_video_mapper import VideoCaptioningFromVideoMapper
from .video_face_blur_mapper import VideoFaceBlurMapper
from .video_ffmpeg_wrapped_mapper import VideoFFmpegWrappedMapper
from .video_remove_watermark_mapper import VideoRemoveWatermarkMapper
from .video_resize_aspect_ratio_mapper import VideoResizeAspectRatioMapper
from .video_resize_resolution_mapper import VideoResizeResolutionMapper
from .video_split_by_duration_mapper import VideoSplitByDurationMapper
from .video_split_by_key_frame_mapper import VideoSplitByKeyFrameMapper
from .video_split_by_scene_mapper import VideoSplitBySceneMapper
from .video_tagging_from_audio_mapper import VideoTaggingFromAudioMapper
from .video_tagging_from_frames_mapper import VideoTaggingFromFramesMapper
from .whitespace_normalization_mapper import WhitespaceNormalizationMapper

__all__ = [
'VideoCaptioningFromAudioMapper',
'VideoTaggingFromAudioMapper',
'ImageCaptioningFromGPT4VMapper',
'PunctuationNormalizationMapper',
'RemoveBibliographyMapper',
'SentenceSplitMapper',
'VideoSplitBySceneMapper',
'CleanIpMapper',
'CleanLinksMapper',
'RemoveHeaderMapper',
'RemoveTableTextMapper',
'VideoRemoveWatermarkMapper',
'RemoveRepeatSentencesMapper',
'ImageDiffusionMapper',
'ImageFaceBlurMapper',
'VideoFFmpegWrappedMapper',
'ChineseConvertMapper',
'NlpcdaZhMapper',
'ImageBlurMapper',
'CleanCopyrightMapper',
'RemoveNonChineseCharacterlMapper',
'VideoSplitByKeyFrameMapper',
'RemoveSpecificCharsMapper',
'VideoResizeAspectRatioMapper',
'CleanHtmlMapper',
'WhitespaceNormalizationMapper',
'VideoTaggingFromFramesMapper',
'RemoveCommentsMapper',
'ExpandMacroMapper',
'ImageCaptioningMapper',
'RemoveWordsWithIncorrectSubstringsMapper',
'VideoCaptioningFromVideoMapper',
'VideoCaptioningFromSummarizerMapper',
'FixUnicodeMapper',
'NlpaugEnMapper',
'VideoCaptioningFromFramesMapper',
'RemoveLongWordsMapper',
'VideoResizeResolutionMapper',
'CleanEmailMapper',
'ReplaceContentMapper',
'AudioFFmpegWrappedMapper',
'VideoSplitByDurationMapper',
'VideoFaceBlurMapper',
]

# yapf: enable
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/image_face_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def __init__(self,
Initialization method.
:param blur_type: Type of blur kernel, including
['mean', 'box', 'gaussian'].
['mean', 'box', 'gaussian'].
:param radius: Radius of blur kernel.
:param args: extra args
:param kwargs: extra args
Expand Down
2 changes: 1 addition & 1 deletion data_juicer/ops/mapper/video_face_blur_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ def __init__(self,
Initialization method.
:param blur_type: Type of blur kernel, including
['mean', 'box', 'gaussian'].
['mean', 'box', 'gaussian'].
:param radius: Radius of blur kernel.
:param args: extra args
:param kwargs: extra args
Expand Down
4 changes: 4 additions & 0 deletions data_juicer/ops/selector/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +1,5 @@
from . import frequency_specified_field_selector, topk_specified_field_selector
from .frequency_specified_field_selector import FrequencySpecifiedFieldSelector
from .topk_specified_field_selector import TopkSpecifiedFieldSelector

__all__ = ['FrequencySpecifiedFieldSelector', 'TopkSpecifiedFieldSelector']
3 changes: 2 additions & 1 deletion data_juicer/utils/process_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,14 @@
import subprocess

import psutil
import torch
from loguru import logger

from data_juicer import cuda_device_count, use_cuda


def get_min_cuda_memory():
# get cuda memory info using "nvidia-smi" command
import torch
min_cuda_memory = torch.cuda.get_device_properties(
0).total_memory / 1024**2
nvidia_smi_output = subprocess.check_output([
Expand All @@ -23,6 +23,7 @@ def get_min_cuda_memory():


def calculate_np(num_proc, op, op_name):
"""Calculate the optimum number of processes for the given OP"""
if num_proc is None:
num_proc = psutil.cpu_count()
if use_cuda() and op._accelerator == 'cuda':
Expand Down
Loading

0 comments on commit 1647e3d

Please sign in to comment.