From 77d908fdc7d12145c0ae047fd38d56beba2652df Mon Sep 17 00:00:00 2001 From: yxdyc Date: Mon, 12 Aug 2024 10:12:49 +0000 Subject: [PATCH] deploy: 625861bbb87bc70d92df57470df2de747f67f280 --- _modules/data_juicer/core/analyzer.html | 245 --------- _modules/data_juicer/core/data.html | 502 ------------------ _modules/data_juicer/core/executor.html | 304 ----------- _modules/data_juicer/core/exporter.html | 380 ------------- _modules/data_juicer/core/tracer.html | 338 ------------ .../data_juicer/format/csv_formatter.html | 140 ----- _modules/data_juicer/format/formatter.html | 440 --------------- .../data_juicer/format/json_formatter.html | 140 ----- _modules/data_juicer/format/load.html | 141 ----- .../data_juicer/format/mixture_formatter.html | 258 --------- .../data_juicer/format/parquet_formatter.html | 140 ----- .../data_juicer/format/text_formatter.html | 273 ---------- .../data_juicer/format/tsv_formatter.html | 141 ----- _modules/index.html | 13 - data_juicer.core.html | 421 +-------------- data_juicer.format.html | 346 +----------- genindex.html | 180 +------ index.html | 22 +- objects.inv | Bin 5335 -> 4761 bytes py-modindex.html | 10 - searchindex.js | 2 +- 21 files changed, 19 insertions(+), 4417 deletions(-) delete mode 100644 _modules/data_juicer/core/analyzer.html delete mode 100644 _modules/data_juicer/core/data.html delete mode 100644 _modules/data_juicer/core/executor.html delete mode 100644 _modules/data_juicer/core/exporter.html delete mode 100644 _modules/data_juicer/core/tracer.html delete mode 100644 _modules/data_juicer/format/csv_formatter.html delete mode 100644 _modules/data_juicer/format/formatter.html delete mode 100644 _modules/data_juicer/format/json_formatter.html delete mode 100644 _modules/data_juicer/format/load.html delete mode 100644 _modules/data_juicer/format/mixture_formatter.html delete mode 100644 _modules/data_juicer/format/parquet_formatter.html delete mode 100644 _modules/data_juicer/format/text_formatter.html delete mode 100644 _modules/data_juicer/format/tsv_formatter.html diff --git a/_modules/data_juicer/core/analyzer.html b/_modules/data_juicer/core/analyzer.html deleted file mode 100644 index d4c984cd6..000000000 --- a/_modules/data_juicer/core/analyzer.html +++ /dev/null @@ -1,245 +0,0 @@ - - - - - - data_juicer.core.analyzer — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.analyzer

-import os
-
-from loguru import logger
-
-from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
-from data_juicer.config import init_configs
-from data_juicer.format import load_formatter
-from data_juicer.ops import Filter, load_ops
-from data_juicer.utils import cache_utils
-
-from .exporter import Exporter
-
-
-
[docs]class Analyzer: - """ - This Analyzer class is used to analyze a specific dataset. - - It will compute stats for all filter ops in the config file, apply - multiple analysis (e.g. OverallAnalysis, ColumnWiseAnalysis, etc.) - on these stats, and generate the analysis results (stats tables, - distribution figures, etc.) to help users understand the input - dataset better. - """ - -
[docs] def __init__(self, cfg=None): - """ - Initialization method. - - :param cfg: optional config dict. - """ - self.cfg = init_configs() if cfg is None else cfg - - self.work_dir = self.cfg.work_dir - self.ops = None - - if self.cfg.use_cache: - logger.info(f'Using cache compression method: ' - f'[{self.cfg.cache_compress}]') - cache_utils.CACHE_COMPRESS = self.cfg.cache_compress - - # setup formatter - logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) - - # prepare exporter and check export path suffix - # NOTICE: no need to export dataset texts for analyzer - # (export_ds=False). Instead, only need to export stats - # (export_stats=True). - logger.info('Preparing exporter...') - self.exporter = Exporter( - self.cfg.export_path, - self.cfg.export_shard_size, - self.cfg.export_in_parallel, - self.cfg.np, - export_ds=self.cfg.export_original_dataset, - keep_stats_in_res_ds=self.cfg.export_original_dataset, - export_stats=True) - - # parsed_res - self.overall_result = None - self.overall_single_plot_path = None - self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
- -
[docs] def run(self, load_data_np=None, skip_export=False): - """ - Running the dataset analysis pipeline. - - :param load_data_np: number of workers when loading the dataset. - :param skip_export: whether export the results into disk - :return: analyzed dataset. - """ - # 1. format data - logger.info('Loading dataset from data formatter...') - if load_data_np is None: - load_data_np = self.cfg.np - dataset = self.formatter.load_dataset(load_data_np, self.cfg) - - # extract processes - logger.info('Preparing process operators...') - self.cfg.process, self.ops = load_ops(self.cfg.process, - self.cfg.op_fusion) - - # 2. stats precompute only for filter ops - logger.info('Computing the stats of dataset...') - stats_collected = False - for op in self.ops: - if isinstance(op, Filter): - original_process = op.process - op.process = None - dataset = dataset.process(op) - op.process = original_process - stats_collected = True - if not stats_collected: - logger.warning('No stats collected. Please add some Filter ops to ' - 'the process list in configs.') - return dataset - - # 3. data export - logger.info('Exporting dataset to disk...') - self.exporter.export(dataset) - if self.cfg.use_cache and self.cfg.cache_compress: - from data_juicer.utils.compress import compress - compress(dataset) - - # 4. analysis and output result to the export path - # 4.1. Only consider fields in Fields.stats - # 4.2. For string fields, only consider its histogram - # 4.3. For numeric fields, consider its histogram and box - # 4.4. Otherwise, DO NOT analyze - - logger.info('Applying overall analysis on stats...') - overall_analysis = OverallAnalysis(dataset, self.analysis_path) - self.overall_result = overall_analysis.analyze( - percentiles=self.cfg.percentiles, - num_proc=self.cfg.np, - skip_export=skip_export) - - logger.info(f'The overall analysis results are: {self.overall_result}') - - logger.info('Applying column-wise analysis on stats...') - column_wise_analysis = ColumnWiseAnalysis( - dataset, - self.analysis_path, - overall_result=self.overall_result, - save_stats_in_one_file=self.cfg.save_stats_in_one_file, - ) - column_wise_analysis.analyze(skip_export=skip_export) - - return dataset
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html deleted file mode 100644 index 39c748ebf..000000000 --- a/_modules/data_juicer/core/data.html +++ /dev/null @@ -1,502 +0,0 @@ - - - - - - data_juicer.core.data — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.data

-from __future__ import annotations
-
-import copy
-import inspect
-from abc import ABC, abstractmethod
-from functools import wraps
-from time import time
-from typing import Union
-
-from datasets import Dataset, DatasetDict, is_caching_enabled
-from datasets.formatting.formatting import LazyBatch
-from loguru import logger
-
-from data_juicer.ops import UNFORKABLE
-from data_juicer.utils import cache_utils
-from data_juicer.utils.compress import (CompressionOff,
-                                        cleanup_compressed_cache_files,
-                                        compress, decompress)
-from data_juicer.utils.fingerprint_utils import generate_fingerprint
-from data_juicer.utils.process_utils import setup_mp
-
-
-class DJDataset(ABC):
-    """Base dataset of DJ"""
-
-    @abstractmethod
-    def process(
-            self,
-            operators,  # TODO: add type hint
-            *,
-            exporter=None,
-            checkpointer=None,
-            tracer=None) -> DJDataset:
-        """process a list of operators on the dataset."""
-        pass
-
-
-def wrap_func_with_nested_access(f):
-    """
-    Before conducting actual function `f`, wrap its args and kargs into nested
-    ones.
-
-    :param f: function to be wrapped.
-    :return: wrapped function
-    """
-
-    def wrap_nested_structure(*args, **kargs):
-        wrapped_args = [nested_obj_factory(arg) for arg in args]
-        wrapped_kargs = {
-            k: nested_obj_factory(arg)
-            for k, arg in kargs.items()
-        }
-        return wrapped_args, nested_obj_factory(wrapped_kargs)
-
-    @wraps(f)
-    def wrapped_f(*args, **kargs):
-        args, kargs = wrap_nested_structure(*args, **kargs)
-        # to ensure the args passing to the final calling of f can be nested,
-        # in case of deeper-order wrapper funcs de-wrap this nesting behavior
-        args = [
-            wrap_func_with_nested_access(arg) if callable(arg) else arg
-            for arg in args
-        ]
-        kargs = {
-            k: (wrap_func_with_nested_access(arg) if callable(arg) else arg)
-            for (k, arg) in kargs.items()
-        }
-        return f(*args, **kargs)
-
-    return wrapped_f
-
-
-def nested_obj_factory(obj):
-    """
-    Use nested classes to wrap the input object.
-
-    :param obj: object to be nested.
-    :return: nested object
-    """
-    if isinstance(obj, Dataset):
-        return NestedDataset(obj)
-    elif isinstance(obj, DatasetDict):
-        return NestedDatasetDict(obj)
-    elif isinstance(obj, dict):
-        return NestedQueryDict(obj)
-    elif isinstance(obj, LazyBatch):
-        obj.data = NestedQueryDict(obj.data)
-        return obj
-    elif isinstance(obj, list):
-        return [nested_obj_factory(item) for item in obj]
-    else:
-        return obj
-
-
-class NestedQueryDict(dict):
-    """Enhanced dict for better usability."""
-
-    def __init__(self, *args, **kargs):
-        if len(args) == 1 and isinstance(args[0], Dataset):
-            # init from another DatasetDict instance
-            self.__dict__ = copy.copy(args[0].__dict__)
-        else:
-            # init from scratch
-            super().__init__(*args, **kargs)
-
-        # batched sample, (k & v) are organized by list manner
-        for k, v in self.items():
-            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
-                self[k] = [NestedQueryDict(item) for item in v]
-
-    def __getitem__(self, key):
-        return nested_query(self, key)
-
-
-class NestedDatasetDict(DatasetDict):
-    """Enhanced HuggingFace-DatasetDict for better usability and efficiency."""
-
-    def __init__(self, *args, **kargs):
-        if len(args) == 1 and isinstance(args[0], Dataset):
-            # init from another DatasetDict instance
-            self.__dict__ = copy.copy(args[0].__dict__)
-        else:
-            # init from scratch
-            super().__init__(*args, **kargs)
-
-    def __getitem__(self, key):
-        return nested_query(self, key)
-
-    def map(self, **args):
-        """Override the map func, which is called by most common operations,
-        such that the processed samples can be accessed by nested manner."""
-        if 'function' not in args or args['function'] is None:
-            args['function'] = lambda x: nested_obj_factory(x)
-        else:
-            args['function'] = wrap_func_with_nested_access(args['function'])
-
-        return super().map(**args)
-
-
-
[docs]class NestedDataset(Dataset, DJDataset): - """Enhanced HuggingFace-Dataset for better usability and efficiency.""" - -
[docs] def __init__(self, *args, **kargs): - if len(args) == 1 and isinstance(args[0], Dataset): - # init from another Dataset instance - self.__dict__ = copy.copy(args[0].__dict__) - else: - # init from scratch - super().__init__(*args, **kargs) - - self.need_to_cleanup_caches = not is_caching_enabled()
- - def __getitem__(self, key): - if isinstance(key, str): - # to index columns by query as string name(s) - res = nested_query(self, key) - else: - # to index rows by query as integer index, slices, - # or iter of indices or bools - res = super().__getitem__(key) - return nested_obj_factory(res) - -
[docs] def process(self, - operators, - *, - exporter=None, - checkpointer=None, - tracer=None): - if operators is None: - return self - - if not isinstance(operators, list): - operators = [operators] - unforkable_operators = set(UNFORKABLE.modules.keys()) - - dataset = self - for op in operators: - mp_context = ['forkserver', 'spawn'] if ( - op.use_cuda() or op._name in unforkable_operators) else None - setup_mp(mp_context) - - start = time() - # run single op - dataset = op(dataset, - exporter=exporter, - checkpointer=checkpointer, - tracer=tracer) - # record processed ops - if checkpointer is not None: - checkpointer.record(op._name, - list(op._process_kwargs.values())[0]) - end = time() - logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. ' - f'Left {len(dataset)} samples.') - return dataset
- -
[docs] def map(self, *args, **kargs): - """Override the map func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - called_func = args[0] - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - called_func = kargs['function'] - - # For wrapped function, try to get its unwrapped (bound) method - while not inspect.ismethod(called_func) and hasattr( - called_func, '__wrapped__'): - called_func = called_func.__wrapped__ - - # Batched is always required for fault tolerance - if inspect.ismethod(called_func): - kargs['batched'] = True - kargs['batch_size'] = kargs.pop( - 'batch_size', 1) if called_func.__self__.is_batched_op() else 1 - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint - - if cache_utils.CACHE_COMPRESS: - decompress(self, kargs['new_fingerprint'], - kargs['num_proc'] if 'num_proc' in kargs else 1) - - new_ds = NestedDataset(super().map(*args, **kargs)) - - if cache_utils.CACHE_COMPRESS: - compress(self, new_ds, - kargs['num_proc'] if 'num_proc' in kargs else 1) - - if self.need_to_cleanup_caches: - new_ds.cleanup_cache_files() - - return new_ds
- -
[docs] def filter(self, *args, **kargs): - """Override the filter func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint - - # For filter, it involves a map and a filter operations, so the final - # cache files includes two sets with different fingerprint (before and - # after). So we need to decompress these two sets of compressed cache - # files - if cache_utils.CACHE_COMPRESS: - decompress(self, [kargs['new_fingerprint'], self._fingerprint], - kargs['num_proc'] if 'num_proc' in kargs else 1) - - # Turn off the compression due to it invokes map actually in the filter - # function. For cache file changes, map: A -> B, filter: A -> A, B. If - # we compress the caches of map, ops after filter cannot find the cache - # files A. So we turn off the inner cache compression for filter. - # Same for cleaning up cache files. - with CompressionOff(): - prev_state = self.need_to_cleanup_caches - self.need_to_cleanup_caches = False - new_ds = NestedDataset(super().filter(*args, **kargs)) - self.need_to_cleanup_caches = prev_state - - if cache_utils.CACHE_COMPRESS: - compress(self, new_ds, - kargs['num_proc'] if 'num_proc' in kargs else 1) - - if self.need_to_cleanup_caches: - new_ds.cleanup_cache_files() - - return new_ds
- -
[docs] def select(self, *args, **kargs): - """Override the select func, such that selected samples can be accessed - by nested manner.""" - return nested_obj_factory(super().select(*args, **kargs))
- -
[docs] @classmethod - def from_dict(cls, *args, **kargs): - """Override the from_dict func, which is called by most from_xx - constructors, such that the constructed dataset object is - NestedDataset.""" - return NestedDataset(super().from_dict(*args, **kargs))
- -
[docs] def add_column(self, *args, **kargs): - """Override the add column func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().add_column(*args, **kargs))
- -
[docs] def select_columns(self, *args, **kargs): - """Override the select columns func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().select_columns(*args, **kargs))
- -
[docs] def remove_columns(self, *args, **kargs): - """Override the remove columns func, such that the processed samples - can be accessed by nested manner.""" - return NestedDataset(super().remove_columns(*args, **kargs))
- -
[docs] def cleanup_cache_files(self): - """Override the cleanup_cache_files func, clear raw and compressed - cache files.""" - cleanup_compressed_cache_files(self) - return super().cleanup_cache_files()
- - -def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset, - NestedQueryDict], key): - """ - Find item from a given object, by first checking flatten layer, then - checking nested layers. - - :param root_obj: the object - :param key: the stored item to be queried, e.g., "meta" or - "meta.date" - :return: - """ - subkeys = key.split('.') - - tmp = root_obj - for i in range(len(subkeys)): - try: - key_to_query = '.'.join(subkeys[i:len(subkeys)]) - if isinstance(tmp, - (NestedQueryDict, NestedDataset, NestedDatasetDict)): - # access field using base_class's func to avoid endless loop - res = super(type(tmp), tmp).__getitem__(key_to_query) - elif isinstance(tmp, list): - # NestedDataset may return multiple rows as list - res = [nested_query(item, key_to_query) for item in tmp] - else: - # NestedQueryDict may return single row - res = tmp[key_to_query] - if res is not None: - return res - except Exception as outer_get_error: - exist_in_dict = issubclass(type(tmp), dict) and \ - '.'.join(subkeys[i:i + 1]) in tmp - exist_in_dataset = issubclass(type(tmp), Dataset) and '.'.join( - subkeys[i:i + 1]) in tmp.features - if exist_in_dict or exist_in_dataset: - # dive into next level - tmp = nested_obj_factory(tmp['.'.join(subkeys[i:i + 1])]) - else: - logger.debug( - f'cannot find item given key={key} in dataset=' - f'{root_obj}. For the final caught outer-exception,' - f'type is: {type(outer_get_error)}, ' - f'info is: {outer_get_error}') - return None - - return None - - -def add_same_content_to_new_column(sample, - new_column_name, - initial_value=None): - """ - A helper function to speed up add_column function. Apply map on this - function in parallel instead of using add_column. - :param sample: a single sample to add this new column/field. - :param new_column_name: the name of this new column/field. - :param initial_value: the initial value of this new column/field. - """ - sample[new_column_name] = initial_value - return sample -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html deleted file mode 100644 index b20696fc4..000000000 --- a/_modules/data_juicer/core/executor.html +++ /dev/null @@ -1,304 +0,0 @@ - - - - - - data_juicer.core.executor — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.executor

-import os
-import traceback
-from time import time
-
-from loguru import logger
-
-from data_juicer.config import init_configs
-from data_juicer.core.data import Dataset
-from data_juicer.format.load import load_formatter
-from data_juicer.format.mixture_formatter import MixtureFormatter
-from data_juicer.ops import OPERATORS, load_ops
-from data_juicer.utils import cache_utils
-from data_juicer.utils.ckpt_utils import CheckpointManager
-
-from ..ops.selector.frequency_specified_field_selector import \
-    FrequencySpecifiedFieldSelector
-from ..ops.selector.topk_specified_field_selector import \
-    TopkSpecifiedFieldSelector
-from .exporter import Exporter
-from .tracer import Tracer
-
-
-
[docs]class Executor: - """ - This Executor class is used to process a specific dataset. - - It will load the dataset and unify the format, then apply all the - ops in the config file in order and generate a processed dataset. - """ - -
[docs] def __init__(self, cfg=None): - """ - Initialization method. - - :param cfg: optional config dict. - """ - self.cfg = init_configs() if cfg is None else cfg - - self.work_dir = self.cfg.work_dir - - self.ops = None - self.tracer = None - self.ckpt_manager = None - - # only enable it when using cache - if self.cfg.use_cache: - logger.info(f'Using cache compression method: ' - f'[{self.cfg.cache_compress}]') - cache_utils.CACHE_COMPRESS = self.cfg.cache_compress - - # setup formatter - logger.info('Setting up data formatter...') - self.formatter = load_formatter(self.cfg.dataset_path, - self.cfg.text_keys, self.cfg.suffixes, - self.cfg.add_suffix) - - # whether to use checkpoint mechanism. If it's true, Executor will - # check if there are existing checkpoints first and try to load the - # checkpoints. If the checkpoints are loaded successfully, ops that - # have been processed will be skipped. - self.process_list = self.cfg.process - if self.cfg.use_checkpoint: - logger.info('Preparing checkpoint manager...') - self.ckpt_dir = os.path.join(self.work_dir, 'ckpt') - self.ckpt_manager = CheckpointManager(self.ckpt_dir, - self.process_list, - self.cfg.np) - if self.ckpt_manager.ckpt_available: - logger.info('Found existed dataset checkpoint.') - self.process_list = self.ckpt_manager.get_left_process_list() - self.cfg.process = self.process_list - - # prepare exporter and check export path suffix - logger.info('Preparing exporter...') - self.exporter = Exporter( - self.cfg.export_path, - self.cfg.export_shard_size, - self.cfg.export_in_parallel, - self.cfg.np, - keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds, - keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds) - - # setup tracer - self.open_tracer = self.cfg.open_tracer - if self.open_tracer: - logger.info('Preparing tracer...') - self.tracer = Tracer(self.work_dir, show_num=self.cfg.trace_num) - self.op_list_to_trace = self.cfg.op_list_to_trace - if len(self.cfg.op_list_to_trace) == 0: - logger.info('Trace for all ops.') - self.op_list_to_trace = set(OPERATORS.modules.keys())
- -
[docs] def sample_data(self, - dataset_to_sample: Dataset = None, - load_data_np=None, - sample_ratio: float = 1.0, - sample_algo: str = 'uniform', - **kwargs): - """ - Sample a subset from the given dataset. - - :param dataset_to_sample: Dataset to sample from. If None, will use - the formatter linked by the executor. Default is None. - :param load_data_np: number of workers when loading the dataset. - :param sample_ratio: The ratio of the sample size to the original - dataset size. Default is 1.0 (no sampling). - :param sample_algo: Sampling algorithm to use. Options are "uniform", - "frequency_specified_field_selector", or - "topk_specified_field_selector". - Default is "uniform". - :return: A sampled Dataset. - """ - # Determine the dataset to sample from - if dataset_to_sample is not None: - dataset = dataset_to_sample - elif self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available: - logger.info('Loading dataset from checkpoint...') - dataset = self.ckpt_manager.load_ckpt() - elif hasattr(self, 'formatter'): - logger.info('Loading dataset from data formatter...') - if load_data_np is None: - load_data_np = self.cfg.np - dataset = self.formatter.load_dataset(load_data_np, self.cfg) - else: - raise ValueError('No dataset available to sample from.') - - # Perform sampling based on the specified algorithm - if sample_algo == 'uniform': - return MixtureFormatter.random_sample(dataset, sample_ratio) - elif sample_algo == 'frequency_specified_field_selector': - dj_op = FrequencySpecifiedFieldSelector(**kwargs) - return dj_op.process(dataset) - elif sample_algo == 'topk_specified_field_selector': - dj_op = TopkSpecifiedFieldSelector(**kwargs) - return dj_op.process(dataset) - else: - raise ValueError(f'Unsupported sample_algo: {sample_algo}')
- -
[docs] def run(self, load_data_np=None): - """ - Running the dataset process pipeline. - - :param load_data_np: number of workers when loading the dataset. - :return: processed dataset. - """ - # 1. format data - if self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available: - logger.info('Loading dataset from checkpoint...') - dataset = self.ckpt_manager.load_ckpt() - else: - logger.info('Loading dataset from data formatter...') - if load_data_np is None: - load_data_np = self.cfg.np - dataset = self.formatter.load_dataset(load_data_np, self.cfg) - - # 2. extract processes - logger.info('Preparing process operators...') - self.process_list, self.ops = load_ops(self.cfg.process, - self.cfg.op_fusion) - - # 3. data process - # - If tracer is open, trace each op after it's processed - # - If checkpoint is open, clean the cache files after each process - logger.info('Processing data...') - tstart = time() - dataset = dataset.process(self.ops, - exporter=self.exporter, - checkpointer=self.ckpt_manager, - tracer=self.tracer) - tend = time() - logger.info(f'All OPs are done in {tend - tstart:.3f}s.') - - # 4. data export - logger.info('Exporting dataset to disk...') - try: - self.exporter.export(dataset) - except: # noqa: E722 - logger.error('An error occurred during exporting the processed ' - 'dataset.') - traceback.print_exc() - if self.cfg.use_checkpoint: - logger.info('Writing checkpoint of dataset processed by ' - 'last op...') - dataset.cleanup_cache_files() - self.ckpt_manager.save_ckpt(dataset) - # compress the last dataset after exporting - if self.cfg.use_cache and self.cfg.cache_compress: - from data_juicer.utils.compress import compress - compress(dataset) - return dataset
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html deleted file mode 100644 index 2a15f71c4..000000000 --- a/_modules/data_juicer/core/exporter.html +++ /dev/null @@ -1,380 +0,0 @@ - - - - - - data_juicer.core.exporter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.exporter

-import os
-from multiprocessing import Pool
-
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, HashKeys
-
-
-
[docs]class Exporter: - """The Exporter class is used to export a dataset to files of specific - format.""" - - KiB = 2**10 # 1024 - MiB = 2**20 # 1024*1024 - GiB = 2**30 # 1024*1024*1024 - TiB = 2**40 # 1024*1024*1024*1024 - -
[docs] def __init__(self, - export_path, - export_shard_size=0, - export_in_parallel=True, - num_proc=1, - export_ds=True, - keep_stats_in_res_ds=False, - keep_hashes_in_res_ds=False, - export_stats=True): - """ - Initialization method. - - :param export_path: the path to export datasets. - :param export_shard_size: the size of each shard of exported - dataset. In default, it's 0, which means export the dataset - to a single file. - :param num_proc: number of process to export the dataset. - :param export_ds: whether to export the dataset contents. - :param keep_stats_in_res_ds: whether to keep stats in the result - dataset. - :param keep_hashes_in_res_ds: whether to keep hashes in the result - dataset. - :param export_stats: whether to export the stats of dataset. - """ - self.export_path = export_path - self.export_shard_size = export_shard_size - self.export_in_parallel = export_in_parallel - self.export_ds = export_ds - self.keep_stats_in_res_ds = keep_stats_in_res_ds - self.keep_hashes_in_res_ds = keep_hashes_in_res_ds - self.export_stats = export_stats - self.suffix = self._get_suffix(export_path) - self.num_proc = num_proc - self.max_shard_size_str = '' - - # get the string format of shard size - if self.export_shard_size // Exporter.TiB: - self.max_shard_size_str = '%.2f TiB' % (self.export_shard_size / - Exporter.TiB) - elif self.export_shard_size // Exporter.GiB: - self.max_shard_size_str = '%.2f GiB' % (self.export_shard_size / - Exporter.GiB) - elif self.export_shard_size // Exporter.MiB: - self.max_shard_size_str = '%.2f MiB' % (self.export_shard_size / - Exporter.MiB) - elif self.export_shard_size // Exporter.KiB: - self.max_shard_size_str = '%.2f KiB' % (self.export_shard_size / - Exporter.KiB) - else: - self.max_shard_size_str = '%.2f Bytes' % (self.export_shard_size) - - # we recommend users to set a shard size between MiB and TiB. - if 0 < self.export_shard_size < Exporter.MiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is less than 1MiB. If the result dataset is too ' - f'large, there might be too many shard files to ' - f'generate.') - if self.export_shard_size >= Exporter.TiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is larger than 1TiB. It might generate large ' - f'single shard file and make loading and exporting ' - f'slower.')
- - def _get_suffix(self, export_path): - """ - Get the suffix of export path and check if it's supported. - - We only support ["jsonl", "json", "parquet"] for now. - - :param export_path: the path to export datasets. - :return: the suffix of export_path. - """ - suffix = export_path.split('.')[-1].lower() - support_dict = self._router() - if suffix not in support_dict: - raise NotImplementedError(f'Suffix of export path [' - f'{export_path}] is not supported ' - f'for now. Only support ' - f'{list(support_dict.keys())}.') - return suffix - - def _export_impl(self, dataset, export_path, suffix, export_stats=True): - """ - Export a dataset to specific path. - - :param dataset: the dataset to export. - :param export_path: the path to export the dataset. - :param suffix: suffix of export path. - :param export_stats: whether to export stats of dataset. - :return: - """ - if Fields.stats in dataset.features and export_stats: - # export stats of datasets into a single file. - logger.info('Exporting computed stats into a single file...') - ds_stats = dataset.select_columns(Fields.stats) - stats_file = export_path.replace('.' + suffix, '_stats.jsonl') - Exporter.to_jsonl( - ds_stats, - stats_file, - num_proc=self.num_proc if self.export_in_parallel else 1) - - if self.export_ds: - # fetch the corresponding export method according to the suffix - if not self.keep_stats_in_res_ds: - extra_fields = {Fields.stats} - feature_fields = set(dataset.features.keys()) - removed_fields = extra_fields.intersection(feature_fields) - dataset = dataset.remove_columns(removed_fields) - if not self.keep_hashes_in_res_ds: - extra_fields = { - HashKeys.hash, - HashKeys.minhash, - HashKeys.simhash, - HashKeys.imagehash, - HashKeys.videohash, - } - feature_fields = set(dataset.features.keys()) - removed_fields = extra_fields.intersection(feature_fields) - dataset = dataset.remove_columns(removed_fields) - export_method = Exporter._router()[suffix] - if self.export_shard_size <= 0: - # export the whole dataset into one single file. - logger.info('Export dataset into a single file...') - export_method( - dataset, - export_path, - num_proc=self.num_proc if self.export_in_parallel else 1) - else: - # compute the dataset size and number of shards to split - if dataset._indices is not None: - dataset_nbytes = dataset.data.nbytes * len( - dataset._indices) / len(dataset.data) - else: - dataset_nbytes = dataset.data.nbytes - num_shards = int(dataset_nbytes / self.export_shard_size) + 1 - num_shards = min(num_shards, len(dataset)) - - # split the dataset into multiple shards - logger.info(f'Split the dataset to export into {num_shards} ' - f'shards. Size of each shard <= ' - f'{self.max_shard_size_str}') - shards = [ - dataset.shard(num_shards=num_shards, - index=i, - contiguous=True) for i in range(num_shards) - ] - len_num = len(str(num_shards)) + 1 - num_fmt = f'%0{len_num}d' - - # regard the export path as a directory and set file names for - # each shard - dirname = os.path.dirname(os.path.abspath(self.export_path)) - basename = os.path.basename(self.export_path).split('.')[0] - os.makedirs(dirname, exist_ok=True) - filenames = [ - os.path.join( - dirname, f'{basename}-{num_fmt % index}-of-' - f'{num_fmt % num_shards}' - f'.{self.suffix}') for index in range(num_shards) - ] - - # export dataset into multiple shards using multiprocessing - logger.info(f'Start to exporting to {num_shards} shards.') - pool = Pool(self.num_proc) - for i in range(num_shards): - pool.apply_async(export_method, - args=( - shards[i], - filenames[i], - )) - pool.close() - pool.join() - -
[docs] def export(self, dataset): - """ - Export method for a dataset. - - :param dataset: the dataset to export. - :return: - """ - self._export_impl(dataset, self.export_path, self.suffix, - self.export_stats)
- -
[docs] def export_compute_stats(self, dataset, export_path): - """ - Export method for saving compute status in filters - """ - keep_stats_in_res_ds = self.keep_stats_in_res_ds - self.keep_stats_in_res_ds = True - self._export_impl(dataset, - export_path, - self.suffix, - export_stats=False) - self.keep_stats_in_res_ds = keep_stats_in_res_ds
- -
[docs] @staticmethod - def to_jsonl(dataset, export_path, num_proc=1, **kwargs): - """ - Export method for jsonl target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param num_proc: the number of processes used to export the dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
- -
[docs] @staticmethod - def to_json(dataset, export_path, num_proc=1, **kwargs): - """ - Export method for json target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param num_proc: the number of processes used to export the dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_json(export_path, - force_ascii=False, - num_proc=num_proc, - lines=False)
- -
[docs] @staticmethod - def to_parquet(dataset, export_path, **kwargs): - """ - Export method for parquet target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_parquet(export_path)
- - # suffix to export method - @staticmethod - def _router(): - """ - A router from different suffixes to corresponding export methods. - - :return: A dict router. - """ - return { - 'jsonl': Exporter.to_jsonl, - 'json': Exporter.to_json, - 'parquet': Exporter.to_parquet, - }
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html deleted file mode 100644 index 9144b7360..000000000 --- a/_modules/data_juicer/core/tracer.html +++ /dev/null @@ -1,338 +0,0 @@ - - - - - - data_juicer.core.tracer — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.core.tracer

-import os
-
-import pandas as pd
-from datasets import Dataset
-from loguru import logger
-
-
-
[docs]class Tracer: - """ - The tracer to trace the sample changes before and after an operator - process. - - The comparison results will be stored in the work directory. - """ - -
[docs] def __init__(self, work_dir, show_num=10): - """ - Initialization method. - - :param work_dir: the work directory to store the comparison - results - :param show_num: the maximum number of samples to show in the - comparison result files. - """ - self.work_dir = os.path.join(work_dir, 'trace') - if not os.path.exists(self.work_dir): - os.makedirs(self.work_dir) - self.show_num = show_num
- -
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a Mapper. - - This will mainly show the different sample pairs due to the - modification by the Mapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert len(previous_ds) == len(processed_ds) - dif_dict = [] - num = 0 - - # Find different samples orderly between previous and processed - # datasets until the total number of found sample pairs is enough. - for i in range(len(previous_ds)): - previous_sample = previous_ds[i][text_key] - processed_sample = processed_ds[i][text_key] - if previous_sample != processed_sample: - dif_dict.append({ - 'original text': previous_sample, - 'processed_text': processed_sample, - }) - num += 1 - if num >= self.show_num: - break - - if len(dif_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(dif_dict) < self.show_num: - logger.warning(f'There are {len(dif_dict)} different samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(dif_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_batch_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a BatchMapper. - - This will mainly show the new samples augmented by the BatchMapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert previous_ds[0][text_key] == processed_ds[0][text_key] - aug_dict = [] - - # Get the first samples - for i in range(len(processed_ds)): - processed_sample = processed_ds[i] - aug_dict.append(processed_sample) - if i + 1 >= self.show_num: - break - - if len(aug_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are ' - f'empty. Thus no comparison results would be ' - f'generated.') - return - elif len(aug_dict) < self.show_num: - logger.warning(f'There are only {len(aug_dict)} samples -- less ' - f'than expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(aug_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_filter(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset): - """ - Compare datasets before and after a Filter. - - This will mainly show the filtered samples by the Filter - - :param op_name: the op name of filter - :param previous_ds: dataset before the filter process - :param processed_ds: dataset processed by the filter - :return: - """ - if len(previous_ds) == len(processed_ds): - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - - # get the number of filtered samples. - total_dif_num = len(previous_ds) - len(processed_ds) - # index of the current sample in the previous dataset - i = 0 - filter_dict = [] - # number of found filtered samples. It's the offset bewteen two - # datasets as well. - num = 0 - while i < len(previous_ds): - if i - num >= len(processed_ds) or \ - previous_ds[i] != processed_ds[i - num]: - # 1. If all samples in processed dataset are checked but there - # still some samples left in the previous dataset, all of these - # left samples are filtered. - # 2. If the corresponding samples in previous and processed - # datasets are different, samples in the previous dataset are - # filtered. - num += 1 - filter_dict.append(previous_ds[i]) - if num >= self.show_num or num >= total_dif_num: - # If the total number of found filtered samples is enough or we - # have found all filtered samples, just stop. - break - i += 1 - if len(filter_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(filter_dict) < self.show_num: - logger.warning(f'There are {len(filter_dict)} filtered samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'filter-{op_name}.jsonl' - filter_df = pd.DataFrame(filter_dict) - filter_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_deduplicator(self, op_name: str, dup_pairs: list): - """ - Compare datasets before and after a Deduplicator. - - This will mainly show the near-duplicate sample pairs extracted - by the Deduplicator. Different from the other two trace methods, - the trace process for deduplicator is embedded into the process - method of deduplicator, but the other two trace methods are - independent of the process method of mapper and filter operators - - :param op_name: the op name of deduplicator - :param dup_pairs: duplicate sample pairs obtained from - deduplicator - :return: - """ - if dup_pairs is None: - logger.warning(f'Op [{op_name}] does not generate dup_pairs ' - f'correctly, thus no comparison results can be ' - f'obtained from this op.') - return - if len(dup_pairs) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(dup_pairs) < self.show_num: - logger.warning(f'There are {len(dup_pairs)} filtered samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # reorganize the duplicate pairs - dup_dict = [] - for key in dup_pairs: - dup_dict.append({ - 'dup1': dup_pairs[key][0], - 'dup2': dup_pairs[key][1], - }) - - # export the tracer result. - res_name = f'duplicate-{op_name}.jsonl' - dup_df = pd.DataFrame(dup_dict) - dup_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html deleted file mode 100644 index 5cdde1f7b..000000000 --- a/_modules/data_juicer/format/csv_formatter.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - data_juicer.format.csv_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.csv_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class CsvFormatter(LocalFormatter): - """ - The class is used to load and format csv-type files. - - Default suffixes is `['.csv']` - """ - SUFFIXES = ['.csv'] - -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='csv', - **kwargs, - )
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html deleted file mode 100644 index a4ba32ec3..000000000 --- a/_modules/data_juicer/format/formatter.html +++ /dev/null @@ -1,440 +0,0 @@ - - - - - - data_juicer.format.formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.formatter

-import os
-from typing import List, Tuple, Union
-
-from datasets import Dataset, DatasetDict, concatenate_datasets, load_dataset
-from loguru import logger
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import (find_files_with_suffix,
-                                          is_absolute_path)
-from data_juicer.utils.registry import Registry
-
-FORMATTERS = Registry('Formatters')
-
-
-class BaseFormatter:
-    """Base class to load dataset."""
-
-    def load_dataset(self, *args) -> Dataset:
-        raise NotImplementedError
-
-
-
[docs]class LocalFormatter(BaseFormatter): - """The class is used to load a dataset from local files or local - directory.""" - -
[docs] def __init__( - self, - dataset_path: str, - type: str, - suffixes: Union[str, List[str], Tuple[str]] = None, - text_keys: List[str] = None, - add_suffix=False, - **kwargs, - ): - """ - Initialization method. - - :param dataset_path: path to a dataset file or a dataset - directory - :param type: a packaged dataset module type (json, csv, etc.) - :param suffixes: files with specified suffixes to be processed - :param text_keys: key names of field that stores sample - text. - :param add_suffix: whether to add the file suffix to dataset - meta info - :param kwargs: extra args - """ - self.type = type - self.kwargs = kwargs - self.text_keys = text_keys - self.data_files = find_files_with_suffix(dataset_path, suffixes) - self.add_suffix = add_suffix
- -
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: - """ - Load a dataset from dataset file or dataset directory, and unify its - format. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: global cfg used in consequent processes, - :return: formatted dataset - """ - datasets = load_dataset(self.type, - data_files={ - key.strip('.'): self.data_files[key] - for key in self.data_files - }, - num_proc=num_proc, - **self.kwargs) - if self.add_suffix: - logger.info('Add suffix info into dataset...') - datasets = add_suffixes(datasets, num_proc) - else: - from data_juicer.core.data import NestedDataset - datasets = NestedDataset( - concatenate_datasets([ds for _, ds in datasets.items()])) - ds = unify_format(datasets, - text_keys=self.text_keys, - num_proc=num_proc, - global_cfg=global_cfg) - return ds
- - -
[docs]class RemoteFormatter(BaseFormatter): - """The class is used to load a dataset from repository of huggingface - hub.""" - -
[docs] def __init__(self, - dataset_path: str, - text_keys: List[str] = None, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param text_keys: key names of field that stores sample - text. - :param kwargs: extra args - """ - self.path = dataset_path - self.text_keys = text_keys - self.kwargs = kwargs
- -
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: - """ - Load a dataset from HuggingFace, and unify its format. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: the global cfg used in consequent processes, - :return: formatted dataset - """ - ds = load_dataset(self.path, - split='train', - num_proc=num_proc, - **self.kwargs) - ds = unify_format(ds, - text_keys=self.text_keys, - num_proc=num_proc, - global_cfg=global_cfg) - return ds
- - -def add_suffixes(datasets: DatasetDict, num_proc: int = 1) -> Dataset: - """ - Add suffix filed to datasets. - - :param datasets: a DatasetDict object - :param num_proc: number of processes to add suffixes - :return: datasets with suffix features. - """ - logger.info('Add suffix column for dataset') - from data_juicer.core.data import add_same_content_to_new_column - for key, ds in datasets.items(): - if Fields.suffix not in ds.features: - datasets[key] = ds.map(add_same_content_to_new_column, - fn_kwargs={ - 'new_column_name': Fields.suffix, - 'initial_value': '.' + key - }, - num_proc=num_proc, - desc='Adding new column for suffix') - datasets = concatenate_datasets([ds for _, ds in datasets.items()]) - from data_juicer.core.data import NestedDataset - return NestedDataset(datasets) - - -def unify_format( - dataset: Dataset, - text_keys: Union[List[str], str] = 'text', - num_proc: int = 1, - global_cfg=None, -) -> Dataset: - """ - Get an unified internal format, conduct the following modifications. - - 1. check keys of dataset - - 2. filter out those samples with empty or None text - - :param dataset: input dataset - :param text_keys: original text key(s) of dataset. - :param num_proc: number of processes for mapping - :param global_cfg: the global cfg used in consequent processes, - since cfg.text_key may be modified after unifying - - :return: unified_format_dataset - """ - from data_juicer.core.data import NestedDataset - if isinstance(dataset, DatasetDict): - datasets = list(dataset.values()) - assert len(datasets) == 1, 'Please make sure the passed datasets ' \ - 'contains only 1 dataset' - dataset = datasets[0] - assert isinstance(dataset, Dataset) or \ - isinstance(dataset, NestedDataset), \ - 'Currently we only support processing data' \ - 'with huggingface-Dataset format' - - if text_keys is None: - text_keys = [] - - if isinstance(text_keys, str): - text_keys = [text_keys] - - logger.info('Unifying the input dataset formats...') - - dataset = NestedDataset(dataset) - - # 1. check text related keys - for key in text_keys: - if key not in dataset.features: - err_msg = f'There is no key [{key}] in dataset. You might set ' \ - f'wrong text_key in the config file for your dataset. ' \ - f'Please check and retry!' - logger.error(err_msg) - raise ValueError(err_msg) - - # 2. filter out those samples with empty or None text - # TODO: optimize the filtering operation for better efficiency - logger.info(f'There are {len(dataset)} sample(s) in the original dataset.') - - def non_empty_text(sample, target_keys): - for target_key in target_keys: - # TODO: case for CFT, in which the len(sample[target_key]) == 0 - if sample[target_key] is None: - # we filter out the samples contains at least None column - # since the op can not handle it now - return False - return True - - dataset = dataset.filter(non_empty_text, - num_proc=num_proc, - fn_kwargs={'target_keys': text_keys}) - logger.info(f'{len(dataset)} samples left after filtering empty text.') - - # 3. convert relative paths to absolute paths - if global_cfg: - ds_dir = global_cfg.dataset_dir - image_key = global_cfg.image_key - audio_key = global_cfg.audio_key - video_key = global_cfg.video_key - - data_path_keys = [] - if image_key in dataset.features: - data_path_keys.append(image_key) - if audio_key in dataset.features: - data_path_keys.append(audio_key) - if video_key in dataset.features: - data_path_keys.append(video_key) - if len(data_path_keys) == 0: - # no image/audio/video path list in dataset, no need to convert - return dataset - - if ds_dir == '': - return dataset - - logger.info('Converting relative paths in the dataset to their ' - 'absolute version. (Based on the directory of input ' - 'dataset file)') - - # function to convert relative paths to absolute paths - def rel2abs(sample, path_keys, dataset_dir): - for path_key in path_keys: - if path_key not in sample: - continue - paths = sample[path_key] - if not paths: - continue - new_paths = [ - path if os.path.isabs(path) else os.path.join( - dataset_dir, path) for path in paths - ] - sample[path_key] = new_paths - return sample - - dataset = dataset.map(rel2abs, - num_proc=num_proc, - fn_kwargs={ - 'path_keys': data_path_keys, - 'dataset_dir': ds_dir - }) - else: - logger.warning('No global config passed into unify_format function. ' - 'Relative paths in the dataset might not be converted ' - 'to their absolute versions. Data of other modalities ' - 'might not be able to find by Data-Juicer.') - - return dataset - - -def load_formatter(dataset_path, - text_keys=None, - suffixes=None, - add_suffix=False, - **kwargs) -> BaseFormatter: - """ - Load the appropriate formatter for different types of data formats. - - :param dataset_path: Path to dataset file or dataset directory - :param text_keys: key names of field that stores sample text. - Default: None - :param suffixes: the suffix of files that will be read. Default: - None - :return: a dataset formatter. - """ - - if suffixes is None: - suffixes = [] - ext_num = {} - if os.path.isdir(dataset_path) or os.path.isfile(dataset_path): - file_dict = find_files_with_suffix(dataset_path, suffixes) - if not file_dict: - raise IOError( - 'Unable to find files matching the suffix from {}'.format( - dataset_path)) - for ext in file_dict: - ext_num[ext] = len(file_dict[ext]) - - # local dataset - if ext_num: - formatter_num = {} - for name, formatter in FORMATTERS.modules.items(): - formatter_num[name] = 0 - for ext in ext_num: - if ext in formatter.SUFFIXES: - formatter_num[name] += ext_num[ext] - formatter = max(formatter_num, key=lambda x: formatter_num[x]) - target_suffixes = set(ext_num.keys()).intersection( - set(FORMATTERS.modules[formatter].SUFFIXES)) - return FORMATTERS.modules[formatter](dataset_path, - text_keys=text_keys, - suffixes=target_suffixes, - add_suffix=add_suffix, - **kwargs) - - # try huggingface dataset hub - elif not is_absolute_path(dataset_path) and dataset_path.count('/') <= 1: - return RemoteFormatter(dataset_path, text_keys=text_keys, **kwargs) - - # no data - else: - raise ValueError(f'Unable to load the dataset from [{dataset_path}]. ' - f'It might be because Data-Juicer doesn\'t support ' - f'the format of this dataset, or the path of this ' - f'dataset is incorrect.Please check if it\'s a valid ' - f'dataset path and retry.') -
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html deleted file mode 100644 index b48328799..000000000 --- a/_modules/data_juicer/format/json_formatter.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - data_juicer.format.json_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.json_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class JsonFormatter(LocalFormatter): - """ - The class is used to load and format json-type files. - - Default suffixes is `['.json', '.jsonl', '.jsonl.zst']` - """ - SUFFIXES = ['.json', '.jsonl', '.jsonl.zst'] - -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='json', - **kwargs, - )
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/load.html b/_modules/data_juicer/format/load.html deleted file mode 100644 index 7e11f690c..000000000 --- a/_modules/data_juicer/format/load.html +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - data_juicer.format.load — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.load

-from .formatter import BaseFormatter
-from .mixture_formatter import MixtureFormatter
-
-
-
[docs]def load_formatter(dataset_path, - text_keys=None, - suffixes=[], - add_suffix=False, - **kwargs) -> BaseFormatter: - """ - Load mixture formatter for multiple different data formats with an optional - weight(default 1.0) according to their formats. - - :param dataset_path: path to a dataset file or a dataset directory - :param text_keys: key names of field that stores sample text. - Default: None - :param suffixes: files with specified suffixes to be processed. - :param add_suffix: whether to add the file suffix to dataset meta - info - :return: a dataset formatter. - """ - formatter = MixtureFormatter(dataset_path=dataset_path, - text_keys=text_keys, - suffixes=suffixes, - add_suffix=add_suffix, - **kwargs) - return formatter
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html deleted file mode 100644 index d6799e266..000000000 --- a/_modules/data_juicer/format/mixture_formatter.html +++ /dev/null @@ -1,258 +0,0 @@ - - - - - - data_juicer.format.mixture_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.mixture_formatter

-from itertools import chain, repeat
-from typing import List, Tuple, Union
-
-import numpy as np
-from datasets import Dataset, concatenate_datasets
-from loguru import logger
-
-from .formatter import BaseFormatter, load_formatter
-
-
-
[docs]class MixtureFormatter(BaseFormatter): - """The class mixes multiple datasets by randomly selecting samples from - every dataset and merging them, and then exports the merged datasset as a - new mixed dataset.""" - -
[docs] def __init__(self, - dataset_path: str, - suffixes: Union[str, List[str], Tuple[str]] = None, - text_keys=None, - add_suffix=False, - max_samples=None, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset dir or a list - of them, optional weights, default 1.0 e.g. `<w1> ds.jsonl - <w2> ds_dir <w3> ds_file.json` - :param suffixes: files with specified suffixes to be processed - :param text_keys: key names of field that stores sample text. - :param add_suffix: whether to add the file suffix to dataset - meta info - :param max_samples: max samples number of mixed dataset. - :param kwargs: extra args - """ - - data_prefixes, weights = self._get_weight(data_prefix=dataset_path) - sample_numbers = [0] * len(weights) - if max_samples is not None: - # Normalize weights. - weights = np.array(weights, dtype=np.float64) - sum_weights = np.sum(weights) - assert sum_weights > 0.0 - weights /= sum_weights - sample_num_per_dataset = [ - int(np.ceil(max_samples * weight)) for weight in weights - ] - - # Adjust - acc_sample_numbers = 0 - for i in range(len(sample_num_per_dataset)): - sample_numbers[i] = min(sample_num_per_dataset[i], - max_samples - acc_sample_numbers) - acc_sample_numbers += sample_numbers[i] - - self.sample_numbers = sample_numbers - self.weights = weights - self.formatters = [ - load_formatter(dataset_path=data_prefix, - suffixes=suffixes, - text_keys=text_keys, - add_suffix=add_suffix, - **kwargs) for data_prefix in data_prefixes - ]
- - def _get_weight(self, data_prefix): - """ - Split every dataset path and its weight. - - :param data_prefix: a dataset file or a dataset dir or a list of - them, e.g. `<w1> ds1.jsonl <w2> ds2_dir <w3> ds3_file.json` - :return: list of dataset path and list of weights - """ - data_prefix = data_prefix.split() - weights = [] - prefixes = [] - - for i in range(len(data_prefix)): - try: - value = max(float(data_prefix[i]), 0.0) - weights.append(value) - except: # noqa: E722 - value = data_prefix[i].strip() - - # if not set weight, use 1.0 as default - if i == 0 or len(weights) == len(prefixes): - weights.append(1.0) - prefixes.append(value) - return prefixes, weights - -
[docs] @classmethod - def random_sample(cls, dataset, weight=1.0, sample_number=0, seed=None): - """ - Randomly sample a subset from a dataset with weight or number, - if sample number is bigger than 0, we will use sample - number instead of weight. - :param dataset: a HuggingFace dataset - :param weight: sample ratio of dataset - :param sample_number: sample number of dataset - :param seed: random sample seed, if None, 42 as default - :return: a subset of dataset - """ - if seed is None: - seed = 42 - - ds_samples = dataset.num_rows - if sample_number <= 0: - sample_number = int(np.ceil(ds_samples * weight)) - - if sample_number == ds_samples: - return dataset - - sample_index = range(sample_number) - - n_repeat = int(np.ceil(sample_number / ds_samples)) - 1 - if n_repeat > 0: - remain_samples = sample_number - n_repeat * ds_samples - sample_index = chain(*repeat(range(ds_samples), n_repeat), - range(remain_samples)) - - return dataset.shuffle(seed=seed).select(sample_index)
- -
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: - """ - Load a mixed dataset. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: the global cfg used in consequent processes, - :return: mixed dataset - """ - dataset_list = [] - for weight, sample_num, formatter in zip(self.weights, - self.sample_numbers, - self.formatters): - dataset = formatter.load_dataset(num_proc, global_cfg) - sampled = self.random_sample(dataset, weight, sample_num) - logger.info(f'sampled {len(sampled)} from ' - f'{len(dataset)}') - dataset_list.append(sampled) - - from data_juicer.core.data import NestedDataset - mixed_dataset = NestedDataset(concatenate_datasets(dataset_list)) - logger.info(f'There are {len(mixed_dataset)} in final dataset') - return mixed_dataset
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html deleted file mode 100644 index c0db08d89..000000000 --- a/_modules/data_juicer/format/parquet_formatter.html +++ /dev/null @@ -1,140 +0,0 @@ - - - - - - data_juicer.format.parquet_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.parquet_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class ParquetFormatter(LocalFormatter): - """ - The class is used to load and format parquet-type files. - - Default suffixes is `['.parquet']` - """ - SUFFIXES = ['.parquet'] - -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='parquet', - **kwargs, - )
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html deleted file mode 100644 index 2938a3bc1..000000000 --- a/_modules/data_juicer/format/text_formatter.html +++ /dev/null @@ -1,273 +0,0 @@ - - - - - - data_juicer.format.text_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.text_formatter

-import os
-from multiprocessing import Pool
-
-import pdfplumber
-from datasets import Dataset, concatenate_datasets, load_dataset
-from docx import Document
-from loguru import logger
-
-from data_juicer.utils.cache_utils import DATA_JUICER_CACHE_HOME
-from data_juicer.utils.file_utils import find_files_with_suffix
-
-from .formatter import FORMATTERS, LocalFormatter, add_suffixes, unify_format
-
-
-def extract_txt_from_docx(fn, tgt_path):
-    """
-    Extract text from a docx file and save to target path.
-
-    :param fn: path to input pdf file
-    :param tgt_path: path to save text file.
-    """
-    doc = Document(fn)
-    text = [para.text for para in doc.paragraphs if para.text.strip()]
-    base_fn = os.path.basename(fn).lower().replace('.docx', '.txt')
-    with open(os.path.join(tgt_path, base_fn), 'w') as f:
-        f.write('\n'.join(text))
-
-
-def extract_txt_from_pdf(fn, tgt_path):
-    """
-    Extract text from a pdf file and save to target path.
-
-    :param fn: path to input pdf file
-    :param tgt_path: path to save text file.
-    """
-    with pdfplumber.open(fn) as pdf:
-        text = []
-        for page in pdf.pages:
-            # remove tables from each page extracted by pdfplumber
-            tables = page.find_tables()
-            for table in tables:
-                page = page.outside_bbox(table.bbox)
-            # remove page number from the end of each page
-            page_text = page.extract_text()
-            page_num = str(page.page_number)
-            if page_text.rstrip().endswith(page_num):
-                page_text = page_text.rstrip()[:-len(page_num)]
-            if page_text.strip():
-                text.append(page_text)
-        base_fn = os.path.basename(fn).lower().replace('.pdf', '.txt')
-        with open(os.path.join(tgt_path, base_fn), 'w') as f:
-            f.write('\n'.join(text))
-
-
-
[docs]@FORMATTERS.register_module() -class TextFormatter(LocalFormatter): - """ - The class is used to load and format text-type files. - - e.g. `['.txt', '.pdf', '.cpp', '.docx']` - """ - - SUFFIXES = [ - '.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', - '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', - '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', - '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', - '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', - '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', - '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', - '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', - '.m', '.smali' - ] - -
[docs] def __init__(self, - dataset_path, - suffixes=None, - add_suffix=False, - **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param add_suffix: Whether to add file suffix to datase meta - info - :param kwargs: extra args - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='text', - add_suffix=add_suffix, - **kwargs, - ) - self.dataset_path = dataset_path - self.add_suffix = add_suffix
- -
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: - """ - Load a dataset from local text-type files. - - :param num_proc: number of processes when loading the dataset - :param global_cfg: the global cfg used in consequent processes, - :return: unified_format_dataset. - """ - # extract text to cache directory - extracted_dataset_path = os.path.join( - DATA_JUICER_CACHE_HOME, - os.path.basename(os.path.abspath(self.dataset_path))) - - for file_type in self.data_files: - - # extract text from docx or pdf files, and save as txt type - if file_type == '.docx' or file_type == '.pdf': - extracted_filetype_path = os.path.join(extracted_dataset_path, - file_type.strip('.')) - if not os.path.exists(extracted_filetype_path): - os.makedirs(extracted_filetype_path) - logger.info('Extracting text from {} files...'.format( - file_type.strip('.'))) - - extract_func = extract_txt_from_docx \ - if file_type == '.docx' else extract_txt_from_pdf - pool = Pool(num_proc) - for data_file in self.data_files[file_type]: - pool.apply_async(func=extract_func, - args=( - data_file, - extracted_filetype_path, - )) - pool.close() - pool.join() - logger.info(f'Extracted text files are stored in directory ' - f'{extracted_filetype_path}') - - # look for extracted txt files - self.data_files[file_type] = find_files_with_suffix( - extracted_filetype_path, '.txt')['.txt'] - - # load text dataset, one text file as one sample - datasets = load_dataset('text', - data_files={ - key.strip('.'): self.data_files[key] - for key in self.data_files - }, - sample_by='document', - num_proc=num_proc, - **self.kwargs) - # whether to add file suffix to datase meta info - if self.add_suffix: - logger.info('Add suffix info into dataset...') - datasets = add_suffixes(datasets, num_proc) - else: - datasets = concatenate_datasets([ds for _, ds in datasets.items()]) - return unify_format(datasets, - text_keys=self.text_keys, - num_proc=num_proc, - global_cfg=global_cfg)
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html deleted file mode 100644 index 53cf7320d..000000000 --- a/_modules/data_juicer/format/tsv_formatter.html +++ /dev/null @@ -1,141 +0,0 @@ - - - - - - data_juicer.format.tsv_formatter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.format.tsv_formatter

-from .formatter import FORMATTERS, LocalFormatter
-
-
-
[docs]@FORMATTERS.register_module() -class TsvFormatter(LocalFormatter): - """ - The class is used to load and format tsv-type files. - - Default suffixes is `['.tsv']` - """ - SUFFIXES = ['.tsv'] - -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): - """ - Initialization method. - - :param dataset_path: a dataset file or a dataset directory - :param suffixes: files with specified suffixes to be processed - :param kwargs: extra args, e.g. `delimiter = ','` - """ - super().__init__( - dataset_path=dataset_path, - suffixes=suffixes if suffixes else self.SUFFIXES, - type='csv', - delimiter='\t', - **kwargs, - )
-
- -
-
-
- -
- -
-

© Copyright 2024, Data-Juicer Team.

-
- - Built with Sphinx using a - theme - provided by Read the Docs. - - -
-
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/index.html b/_modules/index.html index 3b8220535..73e9dee42 100644 --- a/_modules/index.html +++ b/_modules/index.html @@ -85,19 +85,6 @@

All modules for which code is available

  • data_juicer.analysis.diversity_analysis
  • data_juicer.analysis.overall_analysis
  • data_juicer.config.config
  • -
  • data_juicer.core.analyzer
  • -
  • data_juicer.core.data
  • -
  • data_juicer.core.executor
  • -
  • data_juicer.core.exporter
  • -
  • data_juicer.core.tracer
  • -
  • data_juicer.format.csv_formatter
  • -
  • data_juicer.format.formatter
  • -
  • data_juicer.format.json_formatter
  • -
  • data_juicer.format.load
  • -
  • data_juicer.format.mixture_formatter
  • -
  • data_juicer.format.parquet_formatter
  • -
  • data_juicer.format.text_formatter
  • -
  • data_juicer.format.tsv_formatter
  • data_juicer.ops.base_op
  • data_juicer.ops.common.helper_func
  • data_juicer.ops.deduplicator.document_deduplicator
  • diff --git a/data_juicer.core.html b/data_juicer.core.html index 52a6111fa..551c69a6b 100644 --- a/data_juicer.core.html +++ b/data_juicer.core.html @@ -47,14 +47,7 @@ @@ -93,337 +82,8 @@
    -
    -

    data_juicer.format

    -
    -
    -data_juicer.format.load_formatter(dataset_path, text_keys=None, suffixes=[], add_suffix=False, **kwargs) BaseFormatter[source]
    -

    Load mixture formatter for multiple different data formats with an optional -weight(default 1.0) according to their formats.

    -
    -
    Parameters:
    -
      -
    • dataset_path – path to a dataset file or a dataset directory

    • -
    • text_keys – key names of field that stores sample text. -Default: None

    • -
    • suffixes – files with specified suffixes to be processed.

    • -
    • add_suffix – whether to add the file suffix to dataset meta -info

    • -
    -
    -
    Returns:
    -

    a dataset formatter.

    -
    -
    -
    - -
    -
    -class data_juicer.format.JsonFormatter(dataset_path, suffixes=None, **kwargs)[source]
    -

    Bases: LocalFormatter

    -

    The class is used to load and format json-type files.

    -

    Default suffixes is [‘.json’, ‘.jsonl’, ‘.jsonl.zst’]

    -
    -
    -SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
    -
    - -
    -
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.LocalFormatter(dataset_path: str, type: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
    -

    Bases: BaseFormatter

    -

    The class is used to load a dataset from local files or local -directory.

    -
    -
    -__init__(dataset_path: str, type: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – path to a dataset file or a dataset -directory

    • -
    • type – a packaged dataset module type (json, csv, etc.)

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • text_keys – key names of field that stores sample -text.

    • -
    • add_suffix – whether to add the file suffix to dataset -meta info

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    -

    Load a dataset from dataset file or dataset directory, and unify its -format.

    -
    -
    Parameters:
    -
      -
    • num_proc – number of processes when loading the dataset

    • -
    • global_cfg – global cfg used in consequent processes,

    • -
    -
    -
    Returns:
    -

    formatted dataset

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.RemoteFormatter(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
    -

    Bases: BaseFormatter

    -

    The class is used to load a dataset from repository of huggingface -hub.

    -
    -
    -__init__(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • text_keys – key names of field that stores sample -text.

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    -

    Load a dataset from HuggingFace, and unify its format.

    -
    -
    Parameters:
    -
      -
    • num_proc – number of processes when loading the dataset

    • -
    • global_cfg – the global cfg used in consequent processes,

    • -
    -
    -
    Returns:
    -

    formatted dataset

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.TextFormatter(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
    -

    Bases: LocalFormatter

    -

    The class is used to load and format text-type files.

    -

    e.g. [‘.txt’, ‘.pdf’, ‘.cpp’, ‘.docx’]

    -
    -
    -SUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']
    -
    - -
    -
    -__init__(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • add_suffix – Whether to add file suffix to datase meta -info

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    -

    Load a dataset from local text-type files.

    -
    -
    Parameters:
    -
      -
    • num_proc – number of processes when loading the dataset

    • -
    • global_cfg – the global cfg used in consequent processes,

    • -
    -
    -
    Returns:
    -

    unified_format_dataset.

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.ParquetFormatter(dataset_path, suffixes=None, **kwargs)[source]
    -

    Bases: LocalFormatter

    -

    The class is used to load and format parquet-type files.

    -

    Default suffixes is [‘.parquet’]

    -
    -
    -SUFFIXES = ['.parquet']
    -
    - -
    -
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.CsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
    -

    Bases: LocalFormatter

    -

    The class is used to load and format csv-type files.

    -

    Default suffixes is [‘.csv’]

    -
    -
    -SUFFIXES = ['.csv']
    -
    - -
    -
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.TsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
    -

    Bases: LocalFormatter

    -

    The class is used to load and format tsv-type files.

    -

    Default suffixes is [‘.tsv’]

    -
    -
    -SUFFIXES = ['.tsv']
    -
    - -
    -
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset directory

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • kwargs – extra args, e.g. delimiter = ‘,’

    • -
    -
    -
    -
    - -
    - -
    -
    -class data_juicer.format.MixtureFormatter(dataset_path: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]
    -

    Bases: BaseFormatter

    -

    The class mixes multiple datasets by randomly selecting samples from -every dataset and merging them, and then exports the merged datasset as a -new mixed dataset.

    -
    -
    -__init__(dataset_path: str, suffixes: str | List[str] | Tuple[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • dataset_path – a dataset file or a dataset dir or a list -of them, optional weights, default 1.0 e.g. <w1> ds.jsonl -<w2> ds_dir <w3> ds_file.json

    • -
    • suffixes – files with specified suffixes to be processed

    • -
    • text_keys – key names of field that stores sample text.

    • -
    • add_suffix – whether to add the file suffix to dataset -meta info

    • -
    • max_samples – max samples number of mixed dataset.

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -classmethod random_sample(dataset, weight=1.0, sample_number=0, seed=None)[source]
    -

    Randomly sample a subset from a dataset with weight or number, -if sample number is bigger than 0, we will use sample -number instead of weight. -:param dataset: a HuggingFace dataset -:param weight: sample ratio of dataset -:param sample_number: sample number of dataset -:param seed: random sample seed, if None, 42 as default -:return: a subset of dataset

    -
    - -
    -
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    -

    Load a mixed dataset.

    -
    -
    Parameters:
    -
      -
    • num_proc – number of processes when loading the dataset

    • -
    • global_cfg – the global cfg used in consequent processes,

    • -
    -
    -
    Returns:
    -

    mixed dataset

    -
    -
    -
    - -
    - +
    +

    data_juicer.format

    diff --git a/genindex.html b/genindex.html index 7dc29b637..0019e8f8d 100644 --- a/genindex.html +++ b/genindex.html @@ -91,8 +91,6 @@

    Index

    | F | G | I - | J - | K | L | M | N @@ -114,32 +112,6 @@

    _

  • (data_juicer.analysis.DiversityAnalysis method)
  • (data_juicer.analysis.OverallAnalysis method) -
  • -
  • (data_juicer.core.Analyzer method) -
  • -
  • (data_juicer.core.Executor method) -
  • -
  • (data_juicer.core.Exporter method) -
  • -
  • (data_juicer.core.NestedDataset method) -
  • -
  • (data_juicer.core.Tracer method) -
  • -
  • (data_juicer.format.CsvFormatter method) -
  • -
  • (data_juicer.format.JsonFormatter method) -
  • -
  • (data_juicer.format.LocalFormatter method) -
  • -
  • (data_juicer.format.MixtureFormatter method) -
  • -
  • (data_juicer.format.ParquetFormatter method) -
  • -
  • (data_juicer.format.RemoteFormatter method) -
  • -
  • (data_juicer.format.TextFormatter method) -
  • -
  • (data_juicer.format.TsvFormatter method)
  • (data_juicer.ops.Deduplicator method)
  • @@ -352,8 +324,6 @@

    _

    A

    @@ -549,20 +513,6 @@

    D

    -
  • - data_juicer.core - -
  • -
  • - data_juicer.format - -
  • @@ -586,8 +536,6 @@

    D

  • module
  • - - +
    -
  • Analyzer (class in data_juicer.core) +
  • AudioDurationFilter (class in data_juicer.ops.filter)
    • data_juicer.ops.filter @@ -595,6 +543,8 @@

      D

    • module
    +
    -

    J

    - - -
    - -

    K

    - - -
    -

    L

    @@ -789,8 +695,6 @@

    L

    M

      -
    • map() (data_juicer.core.NestedDataset method) -
    • Mapper (class in data_juicer.ops)
    • MaximumLineLengthFilter (class in data_juicer.ops.filter) @@ -798,10 +702,6 @@

      M

    • merge_config() (in module data_juicer.config)
    • merge_on_whitespace_tab_newline() (in module data_juicer.ops.common) -
    • -
    • MiB (data_juicer.core.Exporter attribute) -
    • -
    • MixtureFormatter (class in data_juicer.format)
    • module @@ -812,10 +712,6 @@

      M

    • data_juicer.analysis
    • data_juicer.config -
    • -
    • data_juicer.core -
    • -
    • data_juicer.format
    • data_juicer.ops
    • @@ -840,12 +736,10 @@

      M

      N

      @@ -862,17 +756,13 @@

      O

      P

        -
      • ParquetFormatter (class in data_juicer.format) -
      • PerplexityFilter (class in data_juicer.ops.filter)
      • PhraseGroundingRecallFilter (class in data_juicer.ops.filter)
      • -
      • process() (data_juicer.core.NestedDataset method) +
      • process() (data_juicer.ops.Deduplicator method)
          -
        • (data_juicer.ops.Deduplicator method) -
        • (data_juicer.ops.deduplicator.DocumentDeduplicator method)
        • (data_juicer.ops.deduplicator.DocumentMinhashDeduplicator method) @@ -1080,8 +970,6 @@

          P

          R

          - - - - - -
              data_juicer.config
              - data_juicer.core -
              - data_juicer.format -
              diff --git a/searchindex.js b/searchindex.js index 030a6872e..784bae1a6 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": 2, "origin": [2, 3, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "thi": [3, 5, 6, 7, 8, 9, 10], "It": [3, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "op": [3, 13], "config": [3, 5, 13], "analysi": [3, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "input": [3, 5, 7, 8, 9, 10], "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "oper": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "which": [3, 5, 7, 8, 9], "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 8, 9], "0": [3, 4, 5, 7, 8, 9], "sample_algo": 3, "str": [3, 4, 6, 7, 8, 9, 10], "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "The": [3, 4, 5, 8, 9, 10], "ratio": [3, 4, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "A": [3, 5, 7, 9], "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "static": 3, "to_jsonl": 3, "jsonl": [3, 4], "target": [3, 8, 10], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8, 9], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "directori": [3, 4, 8], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "pair": [3, 5, 7, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "other": [3, 8, 9], "two": [3, 7, 8], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "dataset_path": 4, "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "accord": [4, 5, 8, 9], "kei": [4, 5, 8, 9, 10], "field": [4, 5, 7, 8, 9, 10], "specifi": [4, 6, 8, 9, 10], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "tupl": [4, 8], "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 8, 9], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "t": [4, 6, 7], "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8], "datasset": 4, "dir": 4, "w1": 4, "d": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "max": [4, 7, 8, 9], "random_sampl": 4, "sample_numb": 4, "seed": 4, "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "we": [4, 7, 8, 9, 13], "instead": [4, 6], "random": [4, 9, 10], "42": 4, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "item": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "regard": 7, "onli": [7, 8, 9], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9, 10], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9, 10], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "support": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9, 10], "addit": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9, 10], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativefloat": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "nonnegativeint": [8, 9], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": [8, 10], "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 13], "dataset": [1, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 5, 6, 7, 8, 9, 10], "base": [1, 5, 7, 8, 9, 10], "object": [1, 2, 8], "appli": [1, 7, 9, 10], "each": [1, 5, 7, 9], "column": [1, 9], "stat": [1, 5, 7, 8], "respect": [1, 9], "__init__": [1, 5, 7, 8, 9, 10], "initi": [1, 2, 7, 8, 9, 10], "method": [1, 6, 7, 8, 9, 10], "paramet": [1, 2, 5, 6, 7, 8, 9, 10], "analyz": [1, 2], "path": [1, 2, 7], "store": [1, 5, 7, 8, 9], "result": [1, 8], "option": 1, "precomput": 1, "overal": 1, "whether": [1, 2, 5, 6, 7, 8, 9], "save": [1, 2], "all": [1, 6, 8, 9], "figur": [1, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 5, 6, 7, 8, 9], "show": [1, 9], "skip_export": 1, "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 7, 8, 9], "": [1, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 9], "singl": [1, 9], "window": [1, 7], "after": [1, 6, 7, 8, 9], "disk": 1, "return": [1, 2, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 5, 7, 8, 9], "param": [1, 2, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 5, 6, 7, 8, 9, 13], "load": [1, 5, 9], "comput": [1, 5, 6, 7, 8], "column_nam": 1, "text": [1, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": 1, "default": [1, 2, 7, 8, 9], "argument": [1, 5, 8, 9], "overallanalysi": [1, 13], "mean": [1, 9], "std": 1, "etc": 1, "refine_single_column": 1, "col": 1, "num_proc": 1, "1": [1, 8, 9], "describ": 1, "panda": 1, "number": [1, 5, 7, 8, 9, 10], "process": [1, 5, 6, 7, 8, 9, 10, 13], "export": [1, 5], "init_config": [2, 13], "arg": [2, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 9], "yaml": 2, "json": [2, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 5, 6, 8, 9], "e": [2, 8, 9], "g": [2, 9], "conifg": 2, "cfg": 2, "defaut": 2, "global": [2, 9], "executor": 2, "export_config": [2, 13], "format": [2, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 6, 7, 8], "__path__": 2, "meta": 2, "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 9], "merg": [2, 6, 8], "configur": 2, "origin": [2, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "accord": [5, 8, 9], "config": [5, 13], "A": [5, 7, 9], "item": 5, "its": [5, 7, 9], "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "The": [5, 8, 9, 10], "instanc": 5, "filter": [5, 7, 9, 13], "kwarg": [5, 7, 8, 9, 10], "remov": [5, 6, 8, 9], "info": 5, "text_kei": 5, "kei": [5, 8, 9, 10], "field": [5, 7, 8, 9, 10], "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "which": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "thi": [5, 6, 7, 8, 9, 10], "input": [5, 7, 8, 9, 10], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "keep": [5, 7, 8, 9], "run": [5, 8, 9], "tracer": [5, 7], "mapper": [5, 13], "conduct": 5, "edit": 5, "dedupl": [5, 9, 13], "compute_hash": [5, 7], "hash": [5, 7], "show_num": [5, 7], "0": [5, 7, 8, 9], "doc": [5, 7], "trace": [5, 7], "when": [5, 7, 8, 9, 10], "open": [5, 7, 9], "duplic": [5, 7], "pair": [5, 7, 9], "selector": [5, 13], "select": [5, 8, 10], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "specifi": [6, 8, 9, 10], "splite": 6, "differ": [6, 7, 8, 9], "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "ratio": [6, 8, 9, 10], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "t": [6, 7], "obtain": 6, "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "than": [6, 7, 8, 9, 10], "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "instead": 6, "str": [6, 7, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "augment": [6, 8, 9], "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "size": [6, 7, 8, 9], "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "extra": [7, 8, 9, 10], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "It": [7, 8, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "we": [7, 8, 9, 13], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "two": [7, 8], "regard": 7, "onli": [7, 8, 9], "them": [7, 8], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "algorithm": [7, 9], "minim": 7, "weight": [7, 9], "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "max": [7, 8, 9], "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9, 10], "huggingfac": [8, 9], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9, 10], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "h": [8, 9], "minimum": [8, 9], "support": [8, 9], "maximum": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "10": [8, 9], "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "float": [8, 9], "max_valu": 8, "numer": 8, "target": [8, 10], "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "uniform": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "extract": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "can": [8, 9], "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "work": [8, 9], "middl": [8, 9], "last": [8, 9], "larger": [8, 9, 10], "addit": [8, 9], "other": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "tupl": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "given": [8, 9], "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "txt": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9, 10], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "int": [8, 9], "identifi": [8, 9], "entiti": 8, "independ": [8, 9], "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "cach": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "directori": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativefloat": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "better": 8, "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "nonnegativeint": [8, 9], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "suffix": 8, "pdf": 8, "docx": 8, "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": [8, 10], "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "befor": 8, "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "gener": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "random": [9, 10], "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "link": 9, "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "tabl": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "most": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "m": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "new": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "chang": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "html": 9, "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "tex": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "randomli": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "map": 9, "bigger": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "content": 9, "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "order": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "op": 13, "common": 13, "analysi": 13, "index": 13, "modul": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [0, 3, 1, "", "cuda_device_count"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "data-juicer-core"]], "data_juicer.format": [[4, "data-juicer-format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file