diff --git a/.buildinfo b/.buildinfo index 551c2f31d..79cea3ca4 100644 --- a/.buildinfo +++ b/.buildinfo @@ -1,4 +1,4 @@ # Sphinx build info version 1 -# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done. -config: 4c41756fe8e61f33740e7839901685c2 +# This file records the configuration used when building these files. When it is not found, a full rebuild will be done. +config: c0644f6dcb399e85439a8e09f1d45c95 tags: 645f666f9bcd5a90fca523b33c5a78b7 diff --git a/_modules/data_juicer.html b/_modules/data_juicer.html index 09b2c941e..d387a7f39 100644 --- a/_modules/data_juicer.html +++ b/_modules/data_juicer.html @@ -1,18 +1,18 @@ - + - data_juicer — data_juicer 1.0.0 documentation + data_juicer — data_juicer 1.0.1 documentation - - - + + + @@ -76,7 +76,7 @@

Source code for data_juicer

-__version__ = '1.0.0'
+__version__ = '1.0.1'
 
 import os
 import subprocess
@@ -123,12 +123,18 @@ 

Source code for data_juicer

 _CUDA_DEVICE_COUNT = _cuda_device_count()
 
 
-
[docs]def cuda_device_count(): +
+[docs] +def cuda_device_count(): return _CUDA_DEVICE_COUNT
-
[docs]def is_cuda_available(): + +
+[docs] +def is_cuda_available(): return _CUDA_DEVICE_COUNT > 0
+
diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html index 06c048f1f..caa40b8d6 100644 --- a/_modules/data_juicer/analysis/column_wise_analysis.html +++ b/_modules/data_juicer/analysis/column_wise_analysis.html @@ -1,18 +1,18 @@ - + - data_juicer.analysis.column_wise_analysis — data_juicer 1.0.0 documentation + data_juicer.analysis.column_wise_analysis — data_juicer 1.0.1 documentation - - - + + + @@ -130,10 +130,14 @@

Source code for data_juicer.analysis.column_wise_analysis

return int(now_row), int(now_col), grids -
[docs]class ColumnWiseAnalysis: +
+[docs] +class ColumnWiseAnalysis: """Apply analysis on each column of stats respectively.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, dataset, output_path, overall_result=None, @@ -160,7 +164,10 @@

Source code for data_juicer.analysis.column_wise_analysis

self.save_stats_in_one_file = save_stats_in_one_file
-
[docs] def analyze(self, show_percentiles=False, show=False, skip_export=False): + +
+[docs] + def analyze(self, show_percentiles=False, show=False, skip_export=False): """ Apply analysis and draw the analysis figure for stats. @@ -268,7 +275,10 @@

Source code for data_juicer.analysis.column_wise_analysis

# TODO: (fixme) the saved png sometime are blank plt.clf()
-
[docs] def draw_hist(self, ax, data, save_path, percentiles=None, show=False): + +
+[docs] + def draw_hist(self, ax, data, save_path, percentiles=None, show=False): """ Draw the histogram for the data. @@ -329,7 +339,10 @@

Source code for data_juicer.analysis.column_wise_analysis

# add a little rotation on labels of x axis to avoid overlapping ax.tick_params(axis='x', rotation=25)
-
[docs] def draw_box(self, ax, data, save_path, percentiles=None, show=False): + +
+[docs] + def draw_box(self, ax, data, save_path, percentiles=None, show=False): """ Draw the box plot for the data. @@ -375,7 +388,9 @@

Source code for data_juicer.analysis.column_wise_analysis

# if no showing, we need to clear this axes to avoid # accumulated overlapped figures in different draw_xxx function # calling - ax.clear()
+ ax.clear()
+
+
diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html index a6ff5fe2b..fa2276840 100644 --- a/_modules/data_juicer/analysis/diversity_analysis.html +++ b/_modules/data_juicer/analysis/diversity_analysis.html @@ -1,18 +1,18 @@ - + - data_juicer.analysis.diversity_analysis — data_juicer 1.0.0 documentation + data_juicer.analysis.diversity_analysis — data_juicer 1.0.1 documentation - - - + + + @@ -161,11 +161,15 @@

Source code for data_juicer.analysis.diversity_analysis

return df -
[docs]class DiversityAnalysis: +
+[docs] +class DiversityAnalysis: """Apply diversity analysis for each sample and get an overall analysis result.""" -
[docs] def __init__(self, dataset, output_path, lang_or_model='en'): +
+[docs] + def __init__(self, dataset, output_path, lang_or_model='en'): """Initialization method :param dataset: the dataset to be analyzed :param output_path: path to store the analysis results :param lang_or_model: the diversity model or a specific language used to load @@ -177,7 +181,10 @@

Source code for data_juicer.analysis.diversity_analysis

os.makedirs(self.output_path) self.lang_or_model = lang_or_model
-
[docs] def compute(self, lang_or_model=None, column_name='text'): + +
+[docs] + def compute(self, lang_or_model=None, column_name='text'): """ Apply lexical tree analysis on each sample. @@ -208,7 +215,10 @@

Source code for data_juicer.analysis.diversity_analysis

dataset = self.dataset.map(find_verb_noun) return pd.DataFrame(dataset)
-
[docs] def analyze(self, + +
+[docs] + def analyze(self, lang_or_model=None, column_name='text', postproc_func=get_diversity, @@ -234,7 +244,9 @@

Source code for data_juicer.analysis.diversity_analysis

df.to_csv(os.path.join(self.output_path, 'diversity.csv')) df.to_markdown(os.path.join(self.output_path, 'diversity.md')) - return df
+ return df
+
+
diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html index 8680711df..e8962830a 100644 --- a/_modules/data_juicer/analysis/overall_analysis.html +++ b/_modules/data_juicer/analysis/overall_analysis.html @@ -1,18 +1,18 @@ - + - data_juicer.analysis.overall_analysis — data_juicer 1.0.0 documentation + data_juicer.analysis.overall_analysis — data_juicer 1.0.1 documentation - - - + + + @@ -92,11 +92,15 @@

Source code for data_juicer.analysis.overall_analysis

return col_overall -
[docs]class OverallAnalysis: +
+[docs] +class OverallAnalysis: """Apply analysis on the overall stats, including mean, std, quantiles, etc.""" -
[docs] def __init__(self, dataset, output_path): +
+[docs] + def __init__(self, dataset, output_path): """ Initialization method. @@ -117,7 +121,10 @@

Source code for data_juicer.analysis.overall_analysis

# {numbers, string, list of one of before} self.supported_object_types = {str, list}
-
[docs] def refine_single_column(self, col): + +
+[docs] + def refine_single_column(self, col): if col.dtype != 'object': # not an object, return directly return col @@ -137,7 +144,10 @@

Source code for data_juicer.analysis.overall_analysis

col = col.explode().infer_objects() return col
-
[docs] def analyze(self, percentiles=[], num_proc=1, skip_export=False): + +
+[docs] + def analyze(self, percentiles=[], num_proc=1, skip_export=False): """ Apply overall analysis on the whole dataset based on the describe method of pandas. @@ -171,7 +181,9 @@

Source code for data_juicer.analysis.overall_analysis

overall.to_csv(os.path.join(self.output_path, 'overall.csv')) overall.to_markdown(os.path.join(self.output_path, 'overall.md')) - return overall
+ return overall
+
+
diff --git a/_modules/data_juicer/config/config.html b/_modules/data_juicer/config/config.html index 85efc839c..76ef5041a 100644 --- a/_modules/data_juicer/config/config.html +++ b/_modules/data_juicer/config/config.html @@ -1,18 +1,18 @@ - + - data_juicer.config.config — data_juicer 1.0.0 documentation + data_juicer.config.config — data_juicer 1.0.1 documentation - - - + + + @@ -102,7 +102,9 @@

Source code for data_juicer.config.config

 global_parser = None
 
 
-
[docs]def init_configs(args: Optional[List[str]] = None): +
+[docs] +def init_configs(args: Optional[List[str]] = None): """ initialize the jsonargparse parser and parse configs from one of: 1. POSIX-style commands line args; @@ -309,6 +311,12 @@

Source code for data_juicer.config.config

         help='The compression method of the cache file, which can be'
         'specified in ["gzip", "zstd", "lz4"]. If this parameter is'
         'None, the cache file will not be compressed.')
+    parser.add_argument(
+        '--open_monitor',
+        type=bool,
+        default=True,
+        help='Whether to open the monitor to trace resource utilization for '
+        'each OP during data processing. It\'s True in default.')
     parser.add_argument(
         '--use_checkpoint',
         type=bool,
@@ -433,6 +441,7 @@ 

Source code for data_juicer.config.config

         logger.error('Config initialization failed')
+ def update_ds_cache_dir_and_related_vars(new_ds_cache_path): from pathlib import Path @@ -759,7 +768,9 @@

Source code for data_juicer.config.config

     print(table)
 
 
-
[docs]def export_config(cfg: Namespace, +
+[docs] +def export_config(cfg: Namespace, path: str, format: str = 'yaml', skip_none: bool = True, @@ -801,7 +812,10 @@

Source code for data_juicer.config.config

     logger.info(f'Saved the configuration in {path}')
-
[docs]def merge_config(ori_cfg: Namespace, new_cfg: Namespace): + +
+[docs] +def merge_config(ori_cfg: Namespace, new_cfg: Namespace): """ Merge configuration from new_cfg into ori_cfg @@ -859,7 +873,10 @@

Source code for data_juicer.config.config

         logger.error('Config merge failed')
-
[docs]def prepare_side_configs(ori_config: Union[str, Namespace, Dict]): + +
+[docs] +def prepare_side_configs(ori_config: Union[str, Namespace, Dict]): """ parse the config if ori_config is a string of a config file path with yaml, yml or json format @@ -891,7 +908,10 @@

Source code for data_juicer.config.config

     return config
-
[docs]def get_init_configs(cfg: Union[Namespace, Dict]): + +
+[docs] +def get_init_configs(cfg: Union[Namespace, Dict]): """ set init configs of datajucer for cfg """ @@ -904,6 +924,7 @@

Source code for data_juicer.config.config

         json.dump(cfg, f)
     inited_dj_cfg = init_configs(['--config', temp_file])
     return inited_dj_cfg
+
diff --git a/_modules/data_juicer/core/adapter.html b/_modules/data_juicer/core/adapter.html index 1ca13cc18..eee2749b2 100644 --- a/_modules/data_juicer/core/adapter.html +++ b/_modules/data_juicer/core/adapter.html @@ -1,18 +1,18 @@ - + - data_juicer.core.adapter — data_juicer 1.0.0 documentation + data_juicer.core.adapter — data_juicer 1.0.1 documentation - - - + + + @@ -85,15 +85,22 @@

Source code for data_juicer.core.adapter

 from data_juicer.utils.process_utils import setup_mp
 
 
-
[docs]class Adapter: +
+[docs] +class Adapter: MAX_BATCH_SIZE = 10000 -
[docs] def __init__(self, cfg: dict): +
+[docs] + def __init__(self, cfg: dict): self.cfg = cfg self.idle_resources = Monitor.monitor_current_resources()
-
[docs] @staticmethod + +
+[docs] + @staticmethod def execute_and_probe(dataset, operators, sample_interval=0.5): """ Process the input dataset and probe related information for each OP in @@ -149,7 +156,10 @@

Source code for data_juicer.core.adapter

 
         return resource_util_list
-
[docs] @staticmethod + +
+[docs] + @staticmethod def take_batch(dataset, config): """ Split the dataset into batches based on configuration and load factor. @@ -170,7 +180,10 @@

Source code for data_juicer.core.adapter

         else:
             return dataset.take(batch_size)
-
[docs] def adapt_workloads(self, dataset, operators): + +
+[docs] + def adapt_workloads(self, dataset, operators): """ Manage the scheduling and load balancing for the dataset processing. @@ -187,7 +200,10 @@

Source code for data_juicer.core.adapter

 
         return bs_per_op
-
[docs] def probe_small_batch(self, dataset, operators): + +
+[docs] + def probe_small_batch(self, dataset, operators): """ Perform small batch pre-execution to probe available resources, current load and estimated OP speed, returning load factors and speed @@ -220,7 +236,10 @@

Source code for data_juicer.core.adapter

 
         return analysis_res, len(data_batch)
-
[docs] def batch_size_strategy(self, load_analysis_res, base_bs=1, util_th=0.9): + +
+[docs] + def batch_size_strategy(self, load_analysis_res, base_bs=1, util_th=0.9): """ Decide the batch size for each op according to their workload analysis result and expected utilization threshold. We need to guarantee that @@ -255,7 +274,9 @@

Source code for data_juicer.core.adapter

                              self.MAX_BATCH_SIZE)
             batch_size_per_op.append(bs_this_op)
 
-        return batch_size_per_op
+ return batch_size_per_op
+
+
diff --git a/_modules/data_juicer/core/analyzer.html b/_modules/data_juicer/core/analyzer.html index 727834f6d..de93e44a0 100644 --- a/_modules/data_juicer/core/analyzer.html +++ b/_modules/data_juicer/core/analyzer.html @@ -1,18 +1,18 @@ - + - data_juicer.core.analyzer — data_juicer 1.0.0 documentation + data_juicer.core.analyzer — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.core.analyzer

 from .exporter import Exporter
 
 
-
[docs]class Analyzer: +
+[docs] +class Analyzer: """ This Analyzer class is used to analyze a specific dataset. @@ -106,7 +108,9 @@

Source code for data_juicer.core.analyzer

     dataset better.
     """
 
-
[docs] def __init__(self, cfg: Optional[Namespace] = None): +
+[docs] + def __init__(self, cfg: Optional[Namespace] = None): """ Initialization method. @@ -149,7 +153,10 @@

Source code for data_juicer.core.analyzer

         self.overall_single_plot_path = None
         self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
-
[docs] def run(self, + +
+[docs] + def run(self, load_data_np: Optional[PositiveInt] = None, skip_export: bool = False, skip_return: bool = False): @@ -229,7 +236,9 @@

Source code for data_juicer.core.analyzer

         column_wise_analysis.analyze(skip_export=skip_export)
 
         if not skip_return:
-            return dataset
+ return dataset
+
+
diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html index c58c269b9..175f6034d 100644 --- a/_modules/data_juicer/core/data.html +++ b/_modules/data_juicer/core/data.html @@ -1,18 +1,18 @@ - + - data_juicer.core.data — data_juicer 1.0.0 documentation + data_juicer.core.data — data_juicer 1.0.1 documentation - - - + + + @@ -220,10 +220,14 @@

Source code for data_juicer.core.data

         return super().map(**args)
 
 
-
[docs]class NestedDataset(Dataset, DJDataset): +
+[docs] +class NestedDataset(Dataset, DJDataset): """Enhanced HuggingFace-Dataset for better usability and efficiency.""" -
[docs] def __init__(self, *args, **kargs): +
+[docs] + def __init__(self, *args, **kargs): if len(args) == 1 and isinstance(args[0], Dataset): # init from another Dataset instance self.__dict__ = copy.copy(args[0].__dict__) @@ -233,6 +237,7 @@

Source code for data_juicer.core.data

 
         self.need_to_cleanup_caches = not is_caching_enabled()
+ def __getitem__(self, key): if isinstance(key, str): # to index columns by query as string name(s) @@ -243,13 +248,18 @@

Source code for data_juicer.core.data

             res = super().__getitem__(key)
         return nested_obj_factory(res)
 
-
[docs] def process(self, - operators, - *, - work_dir=None, - exporter=None, - checkpointer=None, - tracer=None): +
+[docs] + def process( + self, + operators, + *, + work_dir=None, + exporter=None, + checkpointer=None, + tracer=None, + open_monitor=True, + ): if operators is None: return self @@ -258,7 +268,8 @@

Source code for data_juicer.core.data

         unforkable_operators = set(UNFORKABLE.modules.keys())
 
         # resource utilization monitor
-        resource_util_list = []
+        if open_monitor:
+            resource_util_list = []
 
         dataset = self
         try:
@@ -275,12 +286,16 @@ 

Source code for data_juicer.core.data

                     'exporter': exporter,
                     'tracer': tracer,
                 }
-                dataset, resource_util_per_op = Monitor.monitor_func(
-                    op.run, args=run_args)
+                if open_monitor:
+                    dataset, resource_util_per_op = Monitor.monitor_func(
+                        op.run, args=run_args)
+                else:
+                    dataset = op.run(**run_args)
                 # record processed ops
                 if checkpointer is not None:
                     checkpointer.record(op._op_cfg)
-                resource_util_list.append(resource_util_per_op)
+                if open_monitor:
+                    resource_util_list.append(resource_util_per_op)
                 end = time()
                 logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. '
                             f'Left {len(dataset)} samples.')
@@ -294,7 +309,10 @@ 

Source code for data_juicer.core.data

                             'last op...')
                 dataset.cleanup_cache_files()
                 checkpointer.save_ckpt(dataset)
-            if work_dir:
+            if work_dir and open_monitor:
+                # get the analyzed version
+                resource_util_list = Monitor.analyze_resource_util_list(
+                    resource_util_list)
                 monitor_dir = os.path.join(work_dir, 'monitor')
                 os.makedirs(monitor_dir, exist_ok=True)
                 with open(os.path.join(monitor_dir, 'monitor.json'),
@@ -304,9 +322,10 @@ 

Source code for data_juicer.core.data

                                                  monitor_dir)
         return dataset
-
[docs] def map(self, *args, **kargs): - """Override the map func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" + +
+[docs] + def update_args(self, args, kargs, is_filter=False): if args: args = list(args) # the first positional para is function @@ -332,15 +351,17 @@

Source code for data_juicer.core.data

             # batched is required for fault-tolerant or batched OP
             if callable(getattr(
                     called_func.__self__,
-                    'is_batched_op')) and called_func.__self__.is_batched_op(
-                    ) or not getattr(called_func.__self__, 'turbo', False):
+                    'is_batched_op')) and called_func.__self__.is_batched_op():
                 kargs['batched'] = True
                 kargs['batch_size'] = kargs.pop('batch_size', 1)
+            elif not getattr(called_func.__self__, 'turbo', False):
+                kargs['batched'] = True
+                kargs['batch_size'] = 1
             else:
                 kargs['batched'] = False
 
-            # rank is required for cuda model loading
-            if callable(
+            # rank is required for cuda model loading for map
+            if not is_filter and callable(
                     getattr(called_func.__self__,
                             'use_cuda')) and called_func.__self__.use_cuda():
                 kargs['with_rank'] = True
@@ -349,6 +370,17 @@ 

Source code for data_juicer.core.data

             new_fingerprint = generate_fingerprint(self, *args, **kargs)
             kargs['new_fingerprint'] = new_fingerprint
 
+        return args, kargs
+ + +
+[docs] + def map(self, *args, **kargs): + """Override the map func, which is called by most common operations, + such that the processed samples can be accessed by nested manner.""" + + args, kargs = self.update_args(args, kargs) + if cache_utils.CACHE_COMPRESS: decompress(self, kargs['new_fingerprint'], kargs['num_proc'] if 'num_proc' in kargs else 1) @@ -364,41 +396,13 @@

Source code for data_juicer.core.data

 
         return new_ds
-
[docs] def filter(self, *args, **kargs): + +
+[docs] + def filter(self, *args, **kargs): """Override the filter func, which is called by most common operations, such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - called_func = args[0] - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - called_func = kargs['function'] - - # For wrapped function, try to get its unwrapped (bound) method - while not inspect.ismethod(called_func) and hasattr( - called_func, '__wrapped__'): - called_func = called_func.__wrapped__ - - # Batched is always required for fault tolerance - if inspect.ismethod(called_func): - if callable(getattr( - called_func.__self__, - 'is_batched_op')) and called_func.__self__.is_batched_op(): - kargs['batched'] = True - kargs['batch_size'] = kargs.pop('batch_size', 1) - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint + args, kargs = self.update_args(args, kargs, is_filter=True) # For filter, it involves a map and a filter operations, so the final # cache files includes two sets with different fingerprint (before and @@ -428,42 +432,65 @@

Source code for data_juicer.core.data

 
         return new_ds
-
[docs] def select(self, *args, **kargs): + +
+[docs] + def select(self, *args, **kargs): """Override the select func, such that selected samples can be accessed by nested manner.""" return nested_obj_factory(super().select(*args, **kargs))
-
[docs] @classmethod + +
+[docs] + @classmethod def from_dict(cls, *args, **kargs): """Override the from_dict func, which is called by most from_xx constructors, such that the constructed dataset object is NestedDataset.""" return NestedDataset(super().from_dict(*args, **kargs))
-
[docs] def add_column(self, *args, **kargs): + +
+[docs] + def add_column(self, *args, **kargs): """Override the add column func, such that the processed samples can be accessed by nested manner.""" return NestedDataset(super().add_column(*args, **kargs))
-
[docs] def select_columns(self, *args, **kargs): + +
+[docs] + def select_columns(self, *args, **kargs): """Override the select columns func, such that the processed samples can be accessed by nested manner.""" return NestedDataset(super().select_columns(*args, **kargs))
-
[docs] def remove_columns(self, *args, **kargs): + +
+[docs] + def remove_columns(self, *args, **kargs): """Override the remove columns func, such that the processed samples can be accessed by nested manner.""" return NestedDataset(super().remove_columns(*args, **kargs))
-
[docs] def cleanup_cache_files(self): + +
+[docs] + def cleanup_cache_files(self): """Override the cleanup_cache_files func, clear raw and compressed cache files.""" cleanup_compressed_cache_files(self) return super().cleanup_cache_files()
-
[docs] @staticmethod + +
+[docs] + @staticmethod def load_from_disk(*args, **kargs): - return NestedDataset(Dataset.load_from_disk(*args, **kargs))
+ return NestedDataset(Dataset.load_from_disk(*args, **kargs))
+
+ def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset, diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html index 60d797f54..e495e8e78 100644 --- a/_modules/data_juicer/core/executor.html +++ b/_modules/data_juicer/core/executor.html @@ -1,18 +1,18 @@ - + - data_juicer.core.executor — data_juicer 1.0.0 documentation + data_juicer.core.executor — data_juicer 1.0.1 documentation - - - + + + @@ -103,7 +103,9 @@

Source code for data_juicer.core.executor

 from .tracer import Tracer
 
 
-
[docs]class Executor: +
+[docs] +class Executor: """ This Executor class is used to process a specific dataset. @@ -111,7 +113,9 @@

Source code for data_juicer.core.executor

     ops in the config file in order and generate a processed dataset.
     """
 
-
[docs] def __init__(self, cfg: Optional[Namespace] = None): +
+[docs] + def __init__(self, cfg: Optional[Namespace] = None): """ Initialization method. @@ -175,7 +179,10 @@

Source code for data_juicer.core.executor

                 logger.info('Trace for all ops.')
                 self.op_list_to_trace = set(OPERATORS.modules.keys())
-
[docs] def sample_data(self, + +
+[docs] + def sample_data(self, dataset_to_sample: Dataset = None, load_data_np=None, sample_ratio: float = 1.0, @@ -221,7 +228,10 @@

Source code for data_juicer.core.executor

         else:
             raise ValueError(f'Unsupported sample_algo: {sample_algo}')
-
[docs] def run(self, + +
+[docs] + def run(self, load_data_np: Optional[PositiveInt] = None, skip_return=False): """ @@ -272,11 +282,14 @@

Source code for data_juicer.core.executor

         # - If checkpoint is open, clean the cache files after each process
         logger.info('Processing data...')
         tstart = time()
-        dataset = dataset.process(ops,
-                                  work_dir=self.work_dir,
-                                  exporter=self.exporter,
-                                  checkpointer=self.ckpt_manager,
-                                  tracer=self.tracer)
+        dataset = dataset.process(
+            ops,
+            work_dir=self.work_dir,
+            exporter=self.exporter,
+            checkpointer=self.ckpt_manager,
+            tracer=self.tracer,
+            open_monitor=self.cfg.open_monitor,
+        )
         tend = time()
         logger.info(f'All OPs are done in {tend - tstart:.3f}s.')
 
@@ -289,7 +302,9 @@ 

Source code for data_juicer.core.executor

             compress(dataset)
 
         if not skip_return:
-            return dataset
+ return dataset
+
+
diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html index 0936f6c97..6d7b5fc06 100644 --- a/_modules/data_juicer/core/exporter.html +++ b/_modules/data_juicer/core/exporter.html @@ -1,18 +1,18 @@ - + - data_juicer.core.exporter — data_juicer 1.0.0 documentation + data_juicer.core.exporter — data_juicer 1.0.1 documentation - - - + + + @@ -85,7 +85,9 @@

Source code for data_juicer.core.exporter

 from data_juicer.utils.constant import Fields, HashKeys
 
 
-
[docs]class Exporter: +
+[docs] +class Exporter: """The Exporter class is used to export a dataset to files of specific format.""" @@ -94,7 +96,9 @@

Source code for data_juicer.core.exporter

     GiB = 2**30  # 1024*1024*1024
     TiB = 2**40  # 1024*1024*1024*1024
 
-
[docs] def __init__(self, +
+[docs] + def __init__(self, export_path, export_shard_size=0, export_in_parallel=True, @@ -157,6 +161,7 @@

Source code for data_juicer.core.exporter

                            f'single shard file and make loading and exporting '
                            f'slower.')
+ def _get_suffix(self, export_path): """ Get the suffix of export path and check if it's supported. @@ -267,7 +272,9 @@

Source code for data_juicer.core.exporter

                 pool.close()
                 pool.join()
 
-
[docs] def export(self, dataset): +
+[docs] + def export(self, dataset): """ Export method for a dataset. @@ -277,7 +284,10 @@

Source code for data_juicer.core.exporter

         self._export_impl(dataset, self.export_path, self.suffix,
                           self.export_stats)
-
[docs] def export_compute_stats(self, dataset, export_path): + +
+[docs] + def export_compute_stats(self, dataset, export_path): """ Export method for saving compute status in filters """ @@ -289,7 +299,10 @@

Source code for data_juicer.core.exporter

                           export_stats=False)
         self.keep_stats_in_res_ds = keep_stats_in_res_ds
-
[docs] @staticmethod + +
+[docs] + @staticmethod def to_jsonl(dataset, export_path, num_proc=1, **kwargs): """ Export method for jsonl target files. @@ -302,7 +315,10 @@

Source code for data_juicer.core.exporter

         """
         dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
-
[docs] @staticmethod + +
+[docs] + @staticmethod def to_json(dataset, export_path, num_proc=1, **kwargs): """ Export method for json target files. @@ -318,7 +334,10 @@

Source code for data_juicer.core.exporter

                         num_proc=num_proc,
                         lines=False)
-
[docs] @staticmethod + +
+[docs] + @staticmethod def to_parquet(dataset, export_path, **kwargs): """ Export method for parquet target files. @@ -330,6 +349,7 @@

Source code for data_juicer.core.exporter

         """
         dataset.to_parquet(export_path)
+ # suffix to export method @staticmethod def _router(): @@ -343,6 +363,7 @@

Source code for data_juicer.core.exporter

             'json': Exporter.to_json,
             'parquet': Exporter.to_parquet,
         }
+
diff --git a/_modules/data_juicer/core/monitor.html b/_modules/data_juicer/core/monitor.html index 95abcb3c3..9736354cd 100644 --- a/_modules/data_juicer/core/monitor.html +++ b/_modules/data_juicer/core/monitor.html @@ -1,18 +1,18 @@ - + - data_juicer.core.monitor — data_juicer 1.0.0 documentation + data_juicer.core.monitor — data_juicer 1.0.1 documentation - - - + + + @@ -99,7 +99,9 @@

Source code for data_juicer.core.monitor

     mdict['resource'] = this_states
 
 
-
[docs]class Monitor: +
+[docs] +class Monitor: """ Monitor resource utilization and other information during the data processing. @@ -157,17 +159,25 @@

Source code for data_juicer.core.monitor

         'GPU util.',
     }
 
-
[docs] def __init__(self): +
+[docs] + def __init__(self): pass
-
[docs] def monitor_all_resources(self): + +
+[docs] + def monitor_all_resources(self): """ Detect the resource utilization of all distributed nodes. """ # TODO raise NotImplementedError
-
[docs] @staticmethod + +
+[docs] + @staticmethod def monitor_current_resources(): """ Detect the resource utilization of the current environment/machine. @@ -200,7 +210,10 @@

Source code for data_juicer.core.monitor

 
         return resource_dict
-
[docs] @staticmethod + +
+[docs] + @staticmethod def draw_resource_util_graph(resource_util_list, store_dir): import matplotlib.pyplot as plt for idx, resource_util_dict in enumerate(resource_util_list): @@ -218,7 +231,10 @@

Source code for data_juicer.core.monitor

                 plt.savefig(os.path.join(store_dir, fn), bbox_inches='tight')
                 plt.clf()
-
[docs] @staticmethod + +
+[docs] + @staticmethod def analyze_resource_util_list(resource_util_list): """ Analyze the resource utilization for a given resource util list. @@ -229,7 +245,10 @@

Source code for data_juicer.core.monitor

             res_list.append(Monitor.analyze_single_resource_util(item))
         return res_list
-
[docs] @staticmethod + +
+[docs] + @staticmethod def analyze_single_resource_util(resource_util_dict): """ Analyze the resource utilization for a single resource util dict. @@ -258,7 +277,10 @@

Source code for data_juicer.core.monitor

 
         return resource_util_dict
-
[docs] @staticmethod + +
+[docs] + @staticmethod def monitor_func(func, args=None, sample_interval=0.5): """ Process the input dataset and probe related information for each OP in @@ -284,7 +306,10 @@

Source code for data_juicer.core.monitor

         resource_util_dict = {}
 
         # start monitor
-        ctx = get_context('fork')
+        start_method = 'fork'
+        if os.name == 'nt':  # for Windows
+            start_method = 'spawn'
+        ctx = get_context(start_method)
         with ctx.Manager() as manager:
             mdict = manager.dict()
             mdict['stop'] = False
@@ -315,7 +340,9 @@ 

Source code for data_juicer.core.monitor

             # calculate speed
             resource_util_dict['time'] = end - start
 
-        return ret, resource_util_dict
+ return ret, resource_util_dict
+
+
diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html index b2cede0c6..ca242ffc9 100644 --- a/_modules/data_juicer/core/tracer.html +++ b/_modules/data_juicer/core/tracer.html @@ -1,18 +1,18 @@ - + - data_juicer.core.tracer — data_juicer 1.0.0 documentation + data_juicer.core.tracer — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.core.tracer

 from loguru import logger
 
 
-
[docs]class Tracer: +
+[docs] +class Tracer: """ The tracer to trace the sample changes before and after an operator process. @@ -92,7 +94,9 @@

Source code for data_juicer.core.tracer

     The comparison results will be stored in the work directory.
     """
 
-
[docs] def __init__(self, work_dir, show_num=10): +
+[docs] + def __init__(self, work_dir, show_num=10): """ Initialization method. @@ -106,7 +110,10 @@

Source code for data_juicer.core.tracer

             os.makedirs(self.work_dir)
         self.show_num = show_num
-
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset, + +
+[docs] + def trace_mapper(self, op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str): """ Compare datasets before and after a Mapper. @@ -156,7 +163,10 @@

Source code for data_juicer.core.tracer

                        lines=True,
                        force_ascii=False)
-
[docs] def trace_batch_mapper(self, op_name: str, previous_ds: Dataset, + +
+[docs] + def trace_batch_mapper(self, op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str): """ Compare datasets before and after a BatchMapper. @@ -196,7 +206,10 @@

Source code for data_juicer.core.tracer

                        lines=True,
                        force_ascii=False)
-
[docs] def trace_filter(self, op_name: str, previous_ds: Dataset, + +
+[docs] + def trace_filter(self, op_name: str, previous_ds: Dataset, processed_ds: Dataset): """ Compare datasets before and after a Filter. @@ -256,7 +269,10 @@

Source code for data_juicer.core.tracer

                           lines=True,
                           force_ascii=False)
-
[docs] def trace_deduplicator(self, op_name: str, dup_pairs: list): + +
+[docs] + def trace_deduplicator(self, op_name: str, dup_pairs: list): """ Compare datasets before and after a Deduplicator. @@ -300,7 +316,9 @@

Source code for data_juicer.core.tracer

         dup_df.to_json(os.path.join(self.work_dir, res_name),
                        orient='records',
                        lines=True,
-                       force_ascii=False)
+ force_ascii=False)
+
+
diff --git a/_modules/data_juicer/format/csv_formatter.html b/_modules/data_juicer/format/csv_formatter.html index 8c74645d7..18059e900 100644 --- a/_modules/data_juicer/format/csv_formatter.html +++ b/_modules/data_juicer/format/csv_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.csv_formatter — data_juicer 1.0.0 documentation + data_juicer.format.csv_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -80,7 +80,9 @@

Source code for data_juicer.format.csv_formatter

from .formatter import FORMATTERS, LocalFormatter -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class CsvFormatter(LocalFormatter): """ The class is used to load and format csv-type files. @@ -89,7 +91,9 @@

Source code for data_juicer.format.csv_formatter

""" SUFFIXES = ['.csv'] -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): +
+[docs] + def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -102,7 +106,9 @@

Source code for data_juicer.format.csv_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='csv', **kwargs, - )
+ )
+
+
diff --git a/_modules/data_juicer/format/empty_formatter.html b/_modules/data_juicer/format/empty_formatter.html index 4affd7e1b..235ceabdf 100644 --- a/_modules/data_juicer/format/empty_formatter.html +++ b/_modules/data_juicer/format/empty_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.empty_formatter — data_juicer 1.0.0 documentation + data_juicer.format.empty_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -89,14 +89,18 @@

Source code for data_juicer.format.empty_formatter

ray = LazyLoader('ray', 'ray') -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class EmptyFormatter(BaseFormatter): """ The class is used to create empty data. """ SUFFIXES = [] -
[docs] def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs): +
+[docs] + def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs): """ Initialization method. @@ -108,11 +112,14 @@

Source code for data_juicer.format.empty_formatter

if isinstance(self.feature_keys, str): self.feature_keys = [self.feature_keys]
+ @property def null_value(self): return None -
[docs] def load_dataset(self, *args, **kwargs): +
+[docs] + def load_dataset(self, *args, **kwargs): data_dict = {} features = Features() @@ -126,17 +133,23 @@

Source code for data_juicer.format.empty_formatter

from data_juicer.core.data import NestedDataset empty_dataset = NestedDataset(empty_dataset) - return empty_dataset
+ return empty_dataset
+
+ -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class RayEmptyFormatter(BaseFormatter): """ The class is used to create empty data for ray. """ SUFFIXES = [] -
[docs] def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs): +
+[docs] + def __init__(self, length, feature_keys: List[str] = [], *args, **kwargs): """ Initialization method. @@ -148,11 +161,14 @@

Source code for data_juicer.format.empty_formatter

if isinstance(self.feature_keys, str): self.feature_keys = [self.feature_keys]
+ @property def null_value(self): return {} -
[docs] def load_dataset(self, *args, **kwargs): +
+[docs] + def load_dataset(self, *args, **kwargs): if len(self.feature_keys): df = pd.DataFrame({ col: [self.null_value for _ in range(self.length)] @@ -163,7 +179,9 @@

Source code for data_juicer.format.empty_formatter

empty_dataset = ray.data.from_pandas(df) - return empty_dataset
+ return empty_dataset
+
+
diff --git a/_modules/data_juicer/format/formatter.html b/_modules/data_juicer/format/formatter.html index 0ba0848c1..5ac386117 100644 --- a/_modules/data_juicer/format/formatter.html +++ b/_modules/data_juicer/format/formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.formatter — data_juicer 1.0.0 documentation + data_juicer.format.formatter — data_juicer 1.0.1 documentation - - - + + + @@ -98,11 +98,15 @@

Source code for data_juicer.format.formatter

raise NotImplementedError
 
 
-
[docs]class LocalFormatter(BaseFormatter): +
+[docs] +class LocalFormatter(BaseFormatter): """The class is used to load a dataset from local files or local directory.""" -
[docs] def __init__( +
+[docs] + def __init__( self, dataset_path: str, type: str, @@ -130,7 +134,10 @@

Source code for data_juicer.format.formatter

self.data_files = find_files_with_suffix(dataset_path, suffixes)
         self.add_suffix = add_suffix
-
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: + +
+[docs] + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from dataset file or dataset directory, and unify its format. @@ -157,14 +164,20 @@

Source code for data_juicer.format.formatter

text_keys=self.text_keys,
                           num_proc=num_proc,
                           global_cfg=global_cfg)
-        return ds
+ return ds
+
+ -
[docs]class RemoteFormatter(BaseFormatter): +
+[docs] +class RemoteFormatter(BaseFormatter): """The class is used to load a dataset from repository of huggingface hub.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, dataset_path: str, text_keys: List[str] = None, **kwargs): @@ -180,7 +193,10 @@

Source code for data_juicer.format.formatter

self.text_keys = text_keys
         self.kwargs = kwargs
-
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: + +
+[docs] + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from HuggingFace, and unify its format. @@ -196,7 +212,9 @@

Source code for data_juicer.format.formatter

text_keys=self.text_keys,
                           num_proc=num_proc,
                           global_cfg=global_cfg)
-        return ds
+ return ds
+
+ def add_suffixes(datasets: DatasetDict, num_proc: int = 1) -> Dataset: diff --git a/_modules/data_juicer/format/json_formatter.html b/_modules/data_juicer/format/json_formatter.html index 9300a3e39..98dc91960 100644 --- a/_modules/data_juicer/format/json_formatter.html +++ b/_modules/data_juicer/format/json_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.json_formatter — data_juicer 1.0.0 documentation + data_juicer.format.json_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -80,7 +80,9 @@

Source code for data_juicer.format.json_formatter

from .formatter import FORMATTERS, LocalFormatter -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class JsonFormatter(LocalFormatter): """ The class is used to load and format json-type files. @@ -89,7 +91,9 @@

Source code for data_juicer.format.json_formatter

""" SUFFIXES = ['.json', '.jsonl', '.jsonl.zst'] -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): +
+[docs] + def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -102,7 +106,9 @@

Source code for data_juicer.format.json_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='json', **kwargs, - )
+ )
+
+
diff --git a/_modules/data_juicer/format/load.html b/_modules/data_juicer/format/load.html index 4c392474b..f947568be 100644 --- a/_modules/data_juicer/format/load.html +++ b/_modules/data_juicer/format/load.html @@ -1,18 +1,18 @@ - + - data_juicer.format.load — data_juicer 1.0.0 documentation + data_juicer.format.load — data_juicer 1.0.1 documentation - - - + + + @@ -81,7 +81,9 @@

Source code for data_juicer.format.load

 from .mixture_formatter import MixtureFormatter
 
 
-
[docs]def load_formatter(dataset_path, +
+[docs] +def load_formatter(dataset_path, generated_dataset_config=None, text_keys=None, suffixes=[], @@ -118,6 +120,7 @@

Source code for data_juicer.format.load

                                  add_suffix=add_suffix,
                                  **kwargs)
     return formatter
+
diff --git a/_modules/data_juicer/format/mixture_formatter.html b/_modules/data_juicer/format/mixture_formatter.html index 4236fcd32..ef735a90f 100644 --- a/_modules/data_juicer/format/mixture_formatter.html +++ b/_modules/data_juicer/format/mixture_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.mixture_formatter — data_juicer 1.0.0 documentation + data_juicer.format.mixture_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -87,12 +87,16 @@

Source code for data_juicer.format.mixture_formatter

from .formatter import BaseFormatter, load_formatter -
[docs]class MixtureFormatter(BaseFormatter): +
+[docs] +class MixtureFormatter(BaseFormatter): """The class mixes multiple datasets by randomly selecting samples from every dataset and merging them, and then exports the merged datasset as a new mixed dataset.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, dataset_path: str, suffixes: Union[str, List[str], None] = None, text_keys=None, @@ -142,6 +146,7 @@

Source code for data_juicer.format.mixture_formatter

**kwargs) for data_prefix in data_prefixes ]
+ def _get_weight(self, data_prefix): """ Split every dataset path and its weight. @@ -167,7 +172,9 @@

Source code for data_juicer.format.mixture_formatter

prefixes.append(value) return prefixes, weights -
[docs] @classmethod +
+[docs] + @classmethod def random_sample(cls, dataset, weight=1.0, sample_number=0, seed=None): """ Randomly sample a subset from a dataset with weight or number, @@ -199,7 +206,10 @@

Source code for data_juicer.format.mixture_formatter

return dataset.shuffle(seed=seed).select(sample_index)
-
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: + +
+[docs] + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a mixed dataset. @@ -220,7 +230,9 @@

Source code for data_juicer.format.mixture_formatter

from data_juicer.core.data import NestedDataset mixed_dataset = NestedDataset(concatenate_datasets(dataset_list)) logger.info(f'There are {len(mixed_dataset)} in final dataset') - return mixed_dataset
+ return mixed_dataset
+
+
diff --git a/_modules/data_juicer/format/parquet_formatter.html b/_modules/data_juicer/format/parquet_formatter.html index 39c8b1b3a..021c82cf6 100644 --- a/_modules/data_juicer/format/parquet_formatter.html +++ b/_modules/data_juicer/format/parquet_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.parquet_formatter — data_juicer 1.0.0 documentation + data_juicer.format.parquet_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -80,7 +80,9 @@

Source code for data_juicer.format.parquet_formatter

from .formatter import FORMATTERS, LocalFormatter -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class ParquetFormatter(LocalFormatter): """ The class is used to load and format parquet-type files. @@ -89,7 +91,9 @@

Source code for data_juicer.format.parquet_formatter

""" SUFFIXES = ['.parquet'] -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): +
+[docs] + def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -102,7 +106,9 @@

Source code for data_juicer.format.parquet_formatter

suffixes=suffixes if suffixes else self.SUFFIXES, type='parquet', **kwargs, - )
+ )
+
+
diff --git a/_modules/data_juicer/format/text_formatter.html b/_modules/data_juicer/format/text_formatter.html index 46820a783..ee70b89b9 100644 --- a/_modules/data_juicer/format/text_formatter.html +++ b/_modules/data_juicer/format/text_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.text_formatter — data_juicer 1.0.0 documentation + data_juicer.format.text_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -131,7 +131,9 @@

Source code for data_juicer.format.text_formatter

f.write('\n'.join(text)) -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class TextFormatter(LocalFormatter): """ The class is used to load and format text-type files. @@ -151,7 +153,9 @@

Source code for data_juicer.format.text_formatter

'.m', '.smali' ] -
[docs] def __init__(self, +
+[docs] + def __init__(self, dataset_path, suffixes=None, add_suffix=False, @@ -175,7 +179,10 @@

Source code for data_juicer.format.text_formatter

self.dataset_path = dataset_path self.add_suffix = add_suffix
-
[docs] def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: + +
+[docs] + def load_dataset(self, num_proc: int = 1, global_cfg=None) -> Dataset: """ Load a dataset from local text-type files. @@ -235,7 +242,9 @@

Source code for data_juicer.format.text_formatter

return unify_format(datasets, text_keys=self.text_keys, num_proc=num_proc, - global_cfg=global_cfg)
+ global_cfg=global_cfg)
+
+
diff --git a/_modules/data_juicer/format/tsv_formatter.html b/_modules/data_juicer/format/tsv_formatter.html index 0e7e22baa..70f13b609 100644 --- a/_modules/data_juicer/format/tsv_formatter.html +++ b/_modules/data_juicer/format/tsv_formatter.html @@ -1,18 +1,18 @@ - + - data_juicer.format.tsv_formatter — data_juicer 1.0.0 documentation + data_juicer.format.tsv_formatter — data_juicer 1.0.1 documentation - - - + + + @@ -80,7 +80,9 @@

Source code for data_juicer.format.tsv_formatter

from .formatter import FORMATTERS, LocalFormatter -
[docs]@FORMATTERS.register_module() +
+[docs] +@FORMATTERS.register_module() class TsvFormatter(LocalFormatter): """ The class is used to load and format tsv-type files. @@ -89,7 +91,9 @@

Source code for data_juicer.format.tsv_formatter

""" SUFFIXES = ['.tsv'] -
[docs] def __init__(self, dataset_path, suffixes=None, **kwargs): +
+[docs] + def __init__(self, dataset_path, suffixes=None, **kwargs): """ Initialization method. @@ -103,7 +107,9 @@

Source code for data_juicer.format.tsv_formatter

type='csv', delimiter='\t', **kwargs, - )
+ )
+
+
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html index 23cdbd2d8..cd8a40f1e 100644 --- a/_modules/data_juicer/ops/base_op.html +++ b/_modules/data_juicer/ops/base_op.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.base_op — data_juicer 1.0.0 documentation + data_juicer.ops.base_op — data_juicer 1.0.1 documentation - - - + + + @@ -149,7 +149,7 @@

Source code for data_juicer.ops.base_op

     return wrapper
 
 
-def catch_map_single_exception(method):
+def catch_map_single_exception(method, return_sample=True):
     """
     For single-map sample-level fault tolerance.
     The input sample is expected batch_size = 1.
@@ -171,8 +171,11 @@ 

Source code for data_juicer.ops.base_op

         if is_batched(sample):
             try:
                 sample = convert_dict_list_to_list_dict(sample)[0]
-                res_sample = method(sample, *args, **kwargs)
-                return convert_list_dict_to_dict_list([res_sample])
+                res = method(sample, *args, **kwargs)
+                if return_sample:
+                    return convert_list_dict_to_dict_list([res])
+                else:
+                    return [res]
             except Exception as e:
                 from loguru import logger
                 logger.error(
@@ -245,9 +248,8 @@ 

Source code for data_juicer.ops.base_op

                 method = wrap_func_with_nested_access(method)
                 setattr(self, name, method)
 
-    @classmethod
-    def is_batched_op(cls):
-        return cls._batched_op
+    def is_batched_op(self):
+        return self._batched_op
 
     def process(self, *args, **kwargs):
         raise NotImplementedError
@@ -299,9 +301,13 @@ 

Source code for data_juicer.ops.base_op

         return np.empty((0, 0), dtype=str)
 
 
-
[docs]class Mapper(OP): +
+[docs] +class Mapper(OP): -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Base class that conducts data editing. @@ -322,6 +328,7 @@

Source code for data_juicer.ops.base_op

         else:
             self.process = catch_map_single_exception(self.process_single)
+ # set the process method is not allowed to be overridden def __init_subclass__(cls, **kwargs): not_allowed_list = ['process'] @@ -332,7 +339,9 @@

Source code for data_juicer.ops.base_op

                     f'{cls.__name__}. Please implement {method_name}_single '
                     f'or {method_name}_batched.')
 
-
[docs] def process_batched(self, samples, *args, **kwargs): +
+[docs] + def process_batched(self, samples, *args, **kwargs): keys = samples.keys() first_key = next(iter(keys)) num_samples = len(samples[first_key]) @@ -344,7 +353,10 @@

Source code for data_juicer.ops.base_op

 
         return samples
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): """ For sample level, sample --> sample @@ -353,7 +365,10 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def run(self, dataset, *, exporter=None, tracer=None): + +
+[docs] + def run(self, dataset, *, exporter=None, tracer=None): dataset = super(Mapper, self).run(dataset) new_dataset = dataset.map( self.process, @@ -365,12 +380,18 @@

Source code for data_juicer.ops.base_op

         if tracer:
             tracer.trace_mapper(self._name, dataset, new_dataset,
                                 self.text_key)
-        return new_dataset
+ return new_dataset
+
+ -
[docs]class Filter(OP): +
+[docs] +class Filter(OP): -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Base class that removes specific info. @@ -394,7 +415,9 @@

Source code for data_juicer.ops.base_op

         else:
             self.compute_stats = catch_map_single_exception(
                 self.compute_stats_single)
-            self.process = catch_map_single_exception(self.process_single)
+ self.process = catch_map_single_exception(self.process_single, + return_sample=False)
+ # set the process method is not allowed to be overridden def __init_subclass__(cls, **kwargs): @@ -406,7 +429,9 @@

Source code for data_juicer.ops.base_op

                     f'{cls.__name__}. Please implement {method_name}_single '
                     f'or {method_name}_batched.')
 
-
[docs] def compute_stats_batched(self, samples, *args, **kwargs): +
+[docs] + def compute_stats_batched(self, samples, *args, **kwargs): keys = samples.keys() num_samples = len(samples[Fields.stats]) for i in range(num_samples): @@ -419,11 +444,17 @@

Source code for data_juicer.ops.base_op

 
         return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): return map(lambda stat: self.process_single({Fields.stats: stat}), samples[Fields.stats])
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): """ Compute stats for the sample which is used as a metric to decide whether to filter this sample. @@ -435,7 +466,10 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): """ For sample level, sample --> Boolean. @@ -444,7 +478,10 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True): + +
+[docs] + def run(self, dataset, *, exporter=None, tracer=None, reduce=True): dataset = super(Filter, self).run(dataset) if Fields.stats not in dataset.features: from data_juicer.core.data import add_same_content_to_new_column @@ -472,12 +509,18 @@

Source code for data_juicer.ops.base_op

                 tracer.trace_filter(self._name, dataset, new_dataset)
             return new_dataset
         else:
-            return dataset
+ return dataset
+
+ -
[docs]class Deduplicator(OP): +
+[docs] +class Deduplicator(OP): -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Base class that conducts deduplication. @@ -498,7 +541,10 @@

Source code for data_juicer.ops.base_op

         else:
             self.compute_hash = catch_map_single_exception(self.compute_hash)
-
[docs] def compute_hash(self, sample): + +
+[docs] + def compute_hash(self, sample): """ Compute hash values for the sample. @@ -507,7 +553,10 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -518,7 +567,10 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True): + +
+[docs] + def run(self, dataset, *, exporter=None, tracer=None, reduce=True): dataset = super(Deduplicator, self).run(dataset) dataset = dataset.map(self.compute_hash, num_proc=self.runtime_np(), @@ -531,12 +583,18 @@

Source code for data_juicer.ops.base_op

                 tracer.trace_deduplicator(self._name, dup_pairs)
             return new_dataset
         else:
-            return dataset
+ return dataset
+
-
[docs]class Selector(OP): -
[docs] def __init__(self, *args, **kwargs): +
+[docs] +class Selector(OP): + +
+[docs] + def __init__(self, *args, **kwargs): """ Base class that conducts selection in dataset-level. @@ -551,7 +609,10 @@

Source code for data_juicer.ops.base_op

         """
         super(Selector, self).__init__(*args, **kwargs)
-
[docs] def process(self, dataset): + +
+[docs] + def process(self, dataset): """ Dataset --> dataset. @@ -560,12 +621,17 @@

Source code for data_juicer.ops.base_op

         """
         raise NotImplementedError
-
[docs] def run(self, dataset, *, exporter=None, tracer=None): + +
+[docs] + def run(self, dataset, *, exporter=None, tracer=None): dataset = super(Selector, self).run(dataset) new_dataset = self.process(dataset) if tracer: tracer.trace_filter(self._name, dataset, new_dataset) - return new_dataset
+ return new_dataset
+
+
diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html index 0d730be3f..4040d75fa 100644 --- a/_modules/data_juicer/ops/common/helper_func.html +++ b/_modules/data_juicer/ops/common/helper_func.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.common.helper_func — data_juicer 1.0.0 documentation + data_juicer.ops.common.helper_func — data_juicer 1.0.1 documentation - - - + + + @@ -104,7 +104,9 @@

Source code for data_juicer.ops.common.helper_func

self.parent[px] = self.parent[py] = min(px, py) -
[docs]def strip(document, strip_characters): +
+[docs] +def strip(document, strip_characters): """ Way faster than document.strip(strip_characters) since strip_characters is now a set instead of a str, and it contains a lot of elements (all the @@ -132,7 +134,10 @@

Source code for data_juicer.ops.common.helper_func

return document_stripped
-
[docs]def split_on_whitespace(document, new_line=False, tab=False): + +
+[docs] +def split_on_whitespace(document, new_line=False, tab=False): """ This method also removes concatenated spaces. @@ -148,7 +153,10 @@

Source code for data_juicer.ops.common.helper_func

return split_document
-
[docs]def split_on_newline_tab_whitespace(document): + +
+[docs] +def split_on_newline_tab_whitespace(document): """ This method is used to split the document into different levels of sub- sentences. @@ -165,7 +173,10 @@

Source code for data_juicer.ops.common.helper_func

return sentences
-
[docs]def merge_on_whitespace_tab_newline(sentences): + +
+[docs] +def merge_on_whitespace_tab_newline(sentences): """ This method is used to merge different levels of sub-sentences into one document. Invert the method split_on_newline_tab_whitespace. Removes @@ -184,7 +195,10 @@

Source code for data_juicer.ops.common.helper_func

return document
-
[docs]def words_augmentation(words, group_size, join_char): + +
+[docs] +def words_augmentation(words, group_size, join_char): """ Augment words, especially for Chinese (without a space between words) and Vietnamese (with a space between syllables). @@ -201,7 +215,10 @@

Source code for data_juicer.ops.common.helper_func

return augmentation
-
[docs]def get_words_from_document( + +
+[docs] +def get_words_from_document( document, token_func=None, new_line=True, @@ -225,7 +242,10 @@

Source code for data_juicer.ops.common.helper_func

return words
-
[docs]def words_refinement(words, + +
+[docs] +def words_refinement(words, lower_case=False, strip_chars=None, use_words_aug=False, @@ -262,7 +282,10 @@

Source code for data_juicer.ops.common.helper_func

return words
-
[docs]def get_sentences_from_document(document, model_func=None): + +
+[docs] +def get_sentences_from_document(document, model_func=None): """ Get sentences from a document. @@ -279,7 +302,10 @@

Source code for data_juicer.ops.common.helper_func

return '\n'.join(sentences)
-
[docs]def split_text_by_punctuation(text): + +
+[docs] +def split_text_by_punctuation(text): """ Split text by any zh and en punctuation @@ -293,6 +319,7 @@

Source code for data_juicer.ops.common.helper_func

result = [s.strip() for s in result if s.strip()] return result
+
diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html index 2dd630459..71cdeef00 100644 --- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.document_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.document_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -93,7 +93,9 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

from ..base_op import OPERATORS, Deduplicator -
[docs]@OPERATORS.register_module('document_deduplicator') +
+[docs] +@OPERATORS.register_module('document_deduplicator') class DocumentDeduplicator(Deduplicator): """ Deduplicator to deduplicate samples at document-level using exact matching. @@ -101,7 +103,9 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

Using md5 hash to deduplicate samples. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, lowercase: bool = False, ignore_non_character: bool = False, *args, @@ -121,7 +125,10 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605 ) if ignore_non_character else None
-
[docs] def compute_hash(self, sample): + +
+[docs] + def compute_hash(self, sample): """ Compute md5 hash values for the sample. @@ -144,7 +151,10 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

sample[HashKeys.hash] = _get_hash(text) return sample
-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -188,7 +198,9 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator

_filter_dup_helper, fn_kwargs=dict(hashes=hashes), load_from_cache_file=False if show_num > 0 else True) # num_proc=1 - return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html index ad84dd701..07f066f61 100644 --- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.document_minhash_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.document_minhash_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -175,7 +175,9 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicatorreturn opt -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class DocumentMinhashDeduplicator(Deduplicator): """ Deduplicator to deduplicate samples at document-level using MinHashLSH. @@ -184,7 +186,9 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator kept in the final dataset. """ -
[docs] def __init__( +
+[docs] + def __init__( self, tokenization: str = 'space', window_size: PositiveInt = 5, @@ -283,7 +287,10 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicatordtype=np.uint64, ).T

-
[docs] def compute_hash(self, sample): + +
+[docs] + def compute_hash(self, sample): """ Compute minhash values for the sample. @@ -347,7 +354,10 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator] return sample

-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -416,7 +426,9 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator) logger.info(f'Keep {len(dataset)} samples after MinHash dedup.') - return dataset, dup_pairs

+ return dataset, dup_pairs
+
+

diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html index 04358f166..0f686c89b 100644 --- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.document_simhash_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.document_simhash_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -100,11 +100,15 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicatorOP_NAME = 'document_simhash_deduplicator' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class DocumentSimhashDeduplicator(Deduplicator): """Deduplicator to deduplicate samples at document-level using SimHash.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, tokenization: str = 'space', window_size: PositiveInt = 6, lowercase: bool = True, @@ -153,7 +157,10 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicatorself.num_blocks = num_blocks self.hamming_distance = hamming_distance

-
[docs] def compute_hash(self, sample): + +
+[docs] + def compute_hash(self, sample): """ Compute simhash values for the sample. @@ -199,7 +206,10 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicatornp.uint64(simhash.compute(map(simhash.unsigned_hash, tokens)))) return sample

-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -302,7 +312,9 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicatorload_from_cache_file=False if show_num > 0 else True) logger.info(f'Keep {len(dataset)} samples after SimHash dedup.') - return dataset, dup_pairs

+ return dataset, dup_pairs
+
+

diff --git a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/image_deduplicator.html index 16c59e270..9c944d20d 100644 --- a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/image_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.image_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.image_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -109,7 +109,9 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

return mapping[method_name] -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageDeduplicator(Deduplicator): """ @@ -117,7 +119,9 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

of images between documents. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, method: str = 'phash', consider_text: bool = False, *args, @@ -141,7 +145,10 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

if self.consider_text: self.text_dedup_op = DocumentDeduplicator(**kwargs)
-
[docs] def compute_hash(self, sample, context=False): + +
+[docs] + def compute_hash(self, sample, context=False): # get hash of text first if self.consider_text: sample = self.text_dedup_op.compute_hash(sample) @@ -165,7 +172,10 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

image_array=np.array(images[key])) return sample
-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -221,7 +231,9 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

_filter_dup_helper, fn_kwargs=dict(hashes=hashes), load_from_cache_file=False if show_num > 0 else True) # num_proc=1 - return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html index 035151e1c..426a2bf24 100644 --- a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.ray_basic_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.ray_basic_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -87,7 +87,9 @@

Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator

redis = LazyLoader('redis', 'redis') -
[docs]class RayBasicDeduplicator(Filter): +
+[docs] +class RayBasicDeduplicator(Filter): """ A basic exact matching deduplicator for RAY. Although its functionality is deduplication, @@ -97,7 +99,9 @@

Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator

# TODO: Set a more reasonable value EMPTY_HASH_VALUE = 'EMPTY' -
[docs] def __init__(self, +
+[docs] + def __init__(self, redis_host: str = 'localhost', redis_port: PositiveInt = 6380, *args, @@ -117,11 +121,17 @@

Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator

r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0) r.flushdb(0)
-
[docs] def calculate_hash(self, sample, context=False): + +
+[docs] + def calculate_hash(self, sample, context=False): """Calculate hash value for the sample.""" raise NotImplementedError
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # init redis client r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0) # compute hash @@ -130,8 +140,13 @@

Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator

sample[HashKeys.is_duplicate] = r.setnx(md5_value, 1) return sample
-
[docs] def process_single(self, sample): - return sample[HashKeys.is_duplicate]
+ +
+[docs] + def process_single(self, sample): + return sample[HashKeys.is_duplicate]
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html index 9e2835196..02edeaee7 100644 --- a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.ray_document_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.ray_document_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -89,13 +89,17 @@

Source code for data_juicer.ops.deduplicator.ray_document_deduplicator

< OP_NAME = 'ray_document_deduplicator' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class RayDocumentDeduplicator(RayBasicDeduplicator): """ Deduplicator to deduplicate samples at document-level using exact matching. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, redis_host: str = 'localhost', redis_port: PositiveInt = 6380, lowercase: bool = False, @@ -121,7 +125,10 @@

Source code for data_juicer.ops.deduplicator.ray_document_deduplicator

< f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605 ) if ignore_non_character else None
-
[docs] def calculate_hash(self, sample, context=False): + +
+[docs] + def calculate_hash(self, sample, context=False): if self.text_key not in sample or not sample[self.text_key]: return RayBasicDeduplicator.EMPTY_HASH_VALUE @@ -131,7 +138,9 @@

Source code for data_juicer.ops.deduplicator.ray_document_deduplicator

< if self.remove_non_character_regex: text = self.remove_non_character_regex.sub('', text) - return hashlib.md5(text.strip().encode('utf-8')).hexdigest()
+ return hashlib.md5(text.strip().encode('utf-8')).hexdigest()
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html index f637aca12..e8fab5a60 100644 --- a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.ray_image_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.ray_image_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -106,7 +106,9 @@

Source code for data_juicer.ops.deduplicator.ray_image_deduplicator

return mapping[method_name] -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class RayImageDeduplicator(RayBasicDeduplicator): """ @@ -114,7 +116,9 @@

Source code for data_juicer.ops.deduplicator.ray_image_deduplicator

of images between documents. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, redis_host: str = 'localhost', redis_port: PositiveInt = 6380, method: str = 'phash', @@ -136,7 +140,10 @@

Source code for data_juicer.ops.deduplicator.ray_image_deduplicator

f'Can only be one of {HASH_METHOD}.') self.hasher = get_hash_method(method)()
-
[docs] def calculate_hash(self, sample, context=False): + +
+[docs] + def calculate_hash(self, sample, context=False): if self.image_key not in sample or not sample[self.image_key]: return RayBasicDeduplicator.EMPTY_HASH_VALUE @@ -151,7 +158,9 @@

Source code for data_juicer.ops.deduplicator.ray_image_deduplicator

hash_value += self.hasher.encode_image( image_array=np.array(images[key])) - return hash_value
+ return hash_value
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html index 7edbb0fcf..97aae970b 100644 --- a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.ray_video_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.ray_video_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

OP_NAME = 'ray_video_deduplicator' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class RayVideoDeduplicator(RayBasicDeduplicator): """ @@ -99,7 +101,9 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

of videos between documents. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, redis_host: str = 'localhost', redis_port: PositiveInt = 6380, *args, @@ -116,7 +120,10 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

*args, **kwargs)
-
[docs] def calculate_hash(self, sample, context=False): + +
+[docs] + def calculate_hash(self, sample, context=False): if self.video_key not in sample or not sample[self.video_key]: return RayBasicDeduplicator.EMPTY_HASH_VALUE @@ -135,7 +142,9 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

for key in videos: close_video(videos[key]) - return md5_hash.hexdigest()
+ return md5_hash.hexdigest()
+
+
diff --git a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html index 3a7bf5c0c..cf051c1cc 100644 --- a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.deduplicator.video_deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator.video_deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

OP_NAME = 'video_deduplicator' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoDeduplicator(Deduplicator): """ @@ -100,7 +102,9 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

of videos between documents. """ -
[docs] def __init__(self, consider_text: bool = False, *args, **kwargs): +
+[docs] + def __init__(self, consider_text: bool = False, *args, **kwargs): """ Initialization. @@ -115,7 +119,10 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

if self.consider_text: self.text_dedup_op = DocumentDeduplicator(**kwargs)
-
[docs] def compute_hash(self, sample, context=False): + +
+[docs] + def compute_hash(self, sample, context=False): # get hash of text first if self.consider_text: sample = self.text_dedup_op.compute_hash(sample) @@ -147,7 +154,10 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

sample[HashKeys.videohash] = md5_hash.hexdigest() return sample
-
[docs] def process(self, dataset, show_num=0): + +
+[docs] + def process(self, dataset, show_num=0): """ For doc-level, dataset --> dataset. @@ -203,7 +213,9 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

_filter_dup_helper, fn_kwargs=dict(hashes=hashes), load_from_cache_file=False if show_num > 0 else True) # num_proc=1 - return dataset, dup_pairs
+ return dataset, dup_pairs
+
+
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html index 829df732f..d618ef9a6 100644 --- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html +++ b/_modules/data_juicer/ops/filter/alphanumeric_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.alphanumeric_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.alphanumeric_filter — data_juicer 1.0.1 documentation - - - + + + @@ -88,14 +88,18 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

OP_NAME = 'alphanumeric_filter' -
[docs]@OPERATORS.register_module('alphanumeric_filter') +
+[docs] +@OPERATORS.register_module('alphanumeric_filter') class AlphanumericFilter(Filter): """Filter to keep samples with alphabet/numeric ratio within a specific range.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = sys.maxsize, @@ -129,7 +133,10 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

pretrained_model_name_or_path='EleutherAI/pythia-6.9b-deduped', return_model=False)
-
[docs] def compute_stats_batched(self, samples): + +
+[docs] + def compute_stats_batched(self, samples): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] @@ -157,7 +164,10 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): ratio_key = StatsKeys.alpha_token_ratio if self.tokenization \ else StatsKeys.alnum_ratio if isinstance(samples[Fields.stats], list): @@ -170,7 +180,9 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

Fields.stats][ratio_key] <= self.max_ratio: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/audio_duration_filter.html b/_modules/data_juicer/ops/filter/audio_duration_filter.html index 70880f701..73044f949 100644 --- a/_modules/data_juicer/ops/filter/audio_duration_filter.html +++ b/_modules/data_juicer/ops/filter/audio_duration_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.audio_duration_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.audio_duration_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,13 +91,17 @@

Source code for data_juicer.ops.filter.audio_duration_filter

OP_NAME = 'audio_duration_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_AUDIOS.register_module(OP_NAME) class AudioDurationFilter(Filter): """Keep data samples whose audios' durations are within a specified range. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_duration: int = 0, max_duration: int = sys.maxsize, any_or_all: str = 'any', @@ -125,7 +129,10 @@

Source code for data_juicer.ops.filter.audio_duration_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.audio_duration in sample[Fields.stats]: return sample @@ -153,7 +160,10 @@

Source code for data_juicer.ops.filter.audio_duration_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): audio_durations = sample[Fields.stats][StatsKeys.audio_duration] keep_bools = np.array([ self.min_duration <= duration <= self.max_duration @@ -166,7 +176,9 @@

Source code for data_juicer.ops.filter.audio_duration_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html b/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html index 79ce82297..3b9761576 100644 --- a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html +++ b/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.audio_nmf_snr_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.audio_nmf_snr_filter — data_juicer 1.0.1 documentation - - - + + + @@ -136,14 +136,18 @@

Source code for data_juicer.ops.filter.audio_nmf_snr_filter

return snr -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_AUDIOS.register_module(OP_NAME) class AudioNMFSNRFilter(Filter): """Keep data samples whose audios' SNRs (computed based on NMF) are within a specified range. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_snr: float = 0, max_snr: float = sys.maxsize, nmf_iter_num: PositiveInt = 500, @@ -175,7 +179,10 @@

Source code for data_juicer.ops.filter.audio_nmf_snr_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.audio_nmf_snr in sample[Fields.stats]: return sample @@ -203,7 +210,10 @@

Source code for data_juicer.ops.filter.audio_nmf_snr_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): audio_snrs = sample[Fields.stats][StatsKeys.audio_nmf_snr] keep_bools = np.array( [self.min_snr <= snr <= self.max_snr for snr in audio_snrs]) @@ -214,7 +224,9 @@

Source code for data_juicer.ops.filter.audio_nmf_snr_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/audio_size_filter.html b/_modules/data_juicer/ops/filter/audio_size_filter.html index 145310f0f..1aef7b8fd 100644 --- a/_modules/data_juicer/ops/filter/audio_size_filter.html +++ b/_modules/data_juicer/ops/filter/audio_size_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.audio_size_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.audio_size_filter — data_juicer 1.0.1 documentation - - - + + + @@ -85,13 +85,17 @@

Source code for data_juicer.ops.filter.audio_size_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('audio_size_filter') +
+[docs] +@OPERATORS.register_module('audio_size_filter') class AudioSizeFilter(Filter): """Keep data samples whose audio size (in bytes/kb/MB/...) within a specific range. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', @@ -119,7 +123,10 @@

Source code for data_juicer.ops.filter.audio_size_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.audio_sizes in sample[Fields.stats]: return sample @@ -137,7 +144,10 @@

Source code for data_juicer.ops.filter.audio_size_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): audio_sizes = sample[Fields.stats][StatsKeys.audio_sizes] keep_bools = np.array([ self.min_size <= audio_size <= self.max_size @@ -150,7 +160,9 @@

Source code for data_juicer.ops.filter.audio_size_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/average_line_length_filter.html b/_modules/data_juicer/ops/filter/average_line_length_filter.html index ebfe6ae63..1116dcbf6 100644 --- a/_modules/data_juicer/ops/filter/average_line_length_filter.html +++ b/_modules/data_juicer/ops/filter/average_line_length_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.average_line_length_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.average_line_length_filter — data_juicer 1.0.1 documentation - - - + + + @@ -87,7 +87,9 @@

Source code for data_juicer.ops.filter.average_line_length_filter

OP_NAME = 'average_line_length_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_LINES.register_module(OP_NAME) class AverageLineLengthFilter(Filter): """Filter to keep samples with average line length within a specific @@ -95,7 +97,9 @@

Source code for data_juicer.ops.filter.average_line_length_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_len: int = 10, max_len: int = sys.maxsize, *args, @@ -116,7 +120,10 @@

Source code for data_juicer.ops.filter.average_line_length_filter

self.min_len = min_len self.max_len = max_len
-
[docs] def compute_stats_batched(self, samples, context=False): + +
+[docs] + def compute_stats_batched(self, samples, context=False): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] context_key = f'{InterVars.lines}' @@ -137,7 +144,10 @@

Source code for data_juicer.ops.filter.average_line_length_filter

len(cur_text) / len(lines) if len(lines) != 0 else 0.0 return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_len <= stat[StatsKeys.avg_line_length] <= @@ -148,7 +158,9 @@

Source code for data_juicer.ops.filter.average_line_length_filter

StatsKeys.avg_line_length] <= self.max_len: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/character_repetition_filter.html b/_modules/data_juicer/ops/filter/character_repetition_filter.html index a8467ef34..c172d94ae 100644 --- a/_modules/data_juicer/ops/filter/character_repetition_filter.html +++ b/_modules/data_juicer/ops/filter/character_repetition_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.character_repetition_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.character_repetition_filter — data_juicer 1.0.1 documentation - - - + + + @@ -89,14 +89,18 @@

Source code for data_juicer.ops.filter.character_repetition_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('character_repetition_filter') +
+[docs] +@OPERATORS.register_module('character_repetition_filter') class CharacterRepetitionFilter(Filter): """Filter to keep samples with char-level n-gram repetition ratio within a specific range.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, rep_len: PositiveInt = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, @@ -120,7 +124,10 @@

Source code for data_juicer.ops.filter.character_repetition_filter

self.min_ratio = min_ratio self.max_ratio = max_ratio
-
[docs] def compute_stats_batched(self, samples): + +
+[docs] + def compute_stats_batched(self, samples): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] @@ -157,7 +164,10 @@

Source code for data_juicer.ops.filter.character_repetition_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_ratio <= stat[StatsKeys.char_rep_ratio] @@ -168,7 +178,9 @@

Source code for data_juicer.ops.filter.character_repetition_filter

StatsKeys.char_rep_ratio] <= self.max_ratio: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/flagged_words_filter.html b/_modules/data_juicer/ops/filter/flagged_words_filter.html index 7511dd295..5e4a3edf1 100644 --- a/_modules/data_juicer/ops/filter/flagged_words_filter.html +++ b/_modules/data_juicer/ops/filter/flagged_words_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.flagged_words_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.flagged_words_filter — data_juicer 1.0.1 documentation - - - + + + @@ -97,13 +97,17 @@

Source code for data_juicer.ops.filter.flagged_words_filter

OP_NAME = 'flagged_words_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) class FlaggedWordFilter(Filter): """Filter to keep samples with flagged-word ratio less than a specific max value.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, @@ -151,7 +155,10 @@

Source code for data_juicer.ops.filter.flagged_words_filter

self.model_key = prepare_model(model_type='sentencepiece', lang=lang)
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.flagged_words_ratio in sample[Fields.stats]: return sample @@ -198,9 +205,14 @@

Source code for data_juicer.ops.filter.flagged_words_filter

StatsKeys.flagged_words_ratio] = flagged_words_ratio return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): return sample[Fields.stats][ - StatsKeys.flagged_words_ratio] <= self.max_ratio
+ StatsKeys.flagged_words_ratio] <= self.max_ratio
+
+
diff --git a/_modules/data_juicer/ops/filter/image_aesthetics_filter.html b/_modules/data_juicer/ops/filter/image_aesthetics_filter.html index 942d25982..9d715363f 100644 --- a/_modules/data_juicer/ops/filter/image_aesthetics_filter.html +++ b/_modules/data_juicer/ops/filter/image_aesthetics_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_aesthetics_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_aesthetics_filter — data_juicer 1.0.1 documentation - - - + + + @@ -93,7 +93,9 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

OP_NAME = 'image_aesthetics_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageAestheticsFilter(Filter): """Filter to keep samples with aesthetics scores within a specific range. @@ -101,7 +103,9 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, @@ -146,7 +150,10 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

self.need_normalized_by_ten = ('shunk031/aesthetics-predictor' in hf_scorer_model)
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_aesthetics_scores in sample[Fields.stats]: return sample @@ -183,7 +190,10 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

aesthetics_scores return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): aesthetics_scores = ( sample)[Fields.stats][StatsKeys.image_aesthetics_scores] if len(aesthetics_scores) <= 0: @@ -198,7 +208,9 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html index c0f04376d..a1a2d233b 100644 --- a/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_aspect_ratio_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_aspect_ratio_filter — data_juicer 1.0.1 documentation - - - + + + @@ -86,7 +86,9 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

from ..op_fusion import LOADED_IMAGES -
[docs]@OPERATORS.register_module('image_aspect_ratio_filter') +
+[docs] +@OPERATORS.register_module('image_aspect_ratio_filter') @LOADED_IMAGES.register_module('image_aspect_ratio_filter') class ImageAspectRatioFilter(Filter): """Filter to keep samples with image aspect ratio within a specific range. @@ -95,7 +97,9 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', @@ -121,7 +125,10 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.aspect_ratios in sample[Fields.stats]: return sample @@ -147,7 +154,10 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

] return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios] keep_bools = np.array([ self.min_ratio <= aspect_ratio <= self.max_ratio @@ -160,7 +170,9 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_face_count_filter.html b/_modules/data_juicer/ops/filter/image_face_count_filter.html index 2a61c0c25..30f611393 100644 --- a/_modules/data_juicer/ops/filter/image_face_count_filter.html +++ b/_modules/data_juicer/ops/filter/image_face_count_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_face_count_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_face_count_filter — data_juicer 1.0.1 documentation - - - + + + @@ -96,7 +96,9 @@

Source code for data_juicer.ops.filter.image_face_count_filter

OP_NAME = 'image_face_count_filter' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageFaceCountFilter(Filter): @@ -110,7 +112,9 @@

Source code for data_juicer.ops.filter.image_face_count_filter

'maxSize': None, } -
[docs] def __init__(self, +
+[docs] + def __init__(self, cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, @@ -153,7 +157,10 @@

Source code for data_juicer.ops.filter.image_face_count_filter

self.model_key = prepare_model(model_type='opencv_classifier', model_path=cv_classifier)
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.face_ratios in sample[Fields.stats]: return sample @@ -186,7 +193,10 @@

Source code for data_juicer.ops.filter.image_face_count_filter

] return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): face_counts = sample[Fields.stats][StatsKeys.face_counts] if len(face_counts) <= 0: return True @@ -200,7 +210,9 @@

Source code for data_juicer.ops.filter.image_face_count_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_face_ratio_filter.html b/_modules/data_juicer/ops/filter/image_face_ratio_filter.html index 5e75f7539..fef2b546a 100644 --- a/_modules/data_juicer/ops/filter/image_face_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/image_face_ratio_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_face_ratio_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_face_ratio_filter — data_juicer 1.0.1 documentation - - - + + + @@ -96,7 +96,9 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

OP_NAME = 'image_face_ratio_filter' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageFaceRatioFilter(Filter): @@ -110,7 +112,9 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

'maxSize': None, } -
[docs] def __init__(self, +
+[docs] + def __init__(self, cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, @@ -152,7 +156,10 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

self.model_key = prepare_model(model_type='opencv_classifier', model_path=cv_classifier)
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.face_ratios in sample[Fields.stats]: return sample @@ -190,7 +197,10 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

] return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): face_ratios = sample[Fields.stats][StatsKeys.face_ratios] if len(face_ratios) <= 0: return True @@ -204,7 +214,9 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_nsfw_filter.html b/_modules/data_juicer/ops/filter/image_nsfw_filter.html index 95f742d2d..777931790 100644 --- a/_modules/data_juicer/ops/filter/image_nsfw_filter.html +++ b/_modules/data_juicer/ops/filter/image_nsfw_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_nsfw_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_nsfw_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,14 +92,18 @@

Source code for data_juicer.ops.filter.image_nsfw_filter

OP_NAME = 'image_nsfw_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageNSFWFilter(Filter): """Filter to keep samples whose images have low nsfw scores.""" _accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, @@ -131,7 +135,10 @@

Source code for data_juicer.ops.filter.image_nsfw_filter

pretrained_model_name_or_path=hf_nsfw_model, trust_remote_code=trust_remote_code)
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_nsfw_score in sample[Fields.stats]: return sample @@ -161,7 +168,10 @@

Source code for data_juicer.ops.filter.image_nsfw_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): itm_scores = sample[Fields.stats][StatsKeys.image_nsfw_score] if len(itm_scores) <= 0: return True @@ -173,7 +183,9 @@

Source code for data_juicer.ops.filter.image_nsfw_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html b/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html index f3cdc1593..1437d6d6f 100644 --- a/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html +++ b/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_pair_similarity_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_pair_similarity_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

OP_NAME = 'image_pair_similarity_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImagePairSimilarityFilter(Filter): """Filter to keep image pairs with similarities between images @@ -100,7 +102,9 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, @@ -133,7 +137,10 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

pretrained_model_name_or_path=hf_clip, trust_remote_code=trust_remote_code)
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_pair_similarity in sample[Fields.stats]: @@ -170,7 +177,10 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): similarity = sample[Fields.stats][StatsKeys.image_pair_similarity] if len(similarity) <= 0: return True @@ -184,7 +194,9 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_shape_filter.html b/_modules/data_juicer/ops/filter/image_shape_filter.html index f6a777dc8..3c418a35c 100644 --- a/_modules/data_juicer/ops/filter/image_shape_filter.html +++ b/_modules/data_juicer/ops/filter/image_shape_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_shape_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_shape_filter — data_juicer 1.0.1 documentation - - - + + + @@ -88,7 +88,9 @@

Source code for data_juicer.ops.filter.image_shape_filter

from ..op_fusion import LOADED_IMAGES -
[docs]@OPERATORS.register_module('image_shape_filter') +
+[docs] +@OPERATORS.register_module('image_shape_filter') @LOADED_IMAGES.register_module('image_shape_filter') class ImageShapeFilter(Filter): """Filter to keep samples with image shape (w, h) within specific ranges. @@ -96,7 +98,9 @@

Source code for data_juicer.ops.filter.image_shape_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_width: int = 1, max_width: int = sys.maxsize, min_height: int = 1, @@ -128,7 +132,10 @@

Source code for data_juicer.ops.filter.image_shape_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.image_width in sample[Fields.stats] \ and StatsKeys.image_height in sample[Fields.stats]: @@ -157,7 +164,10 @@

Source code for data_juicer.ops.filter.image_shape_filter

] return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): ws = sample[Fields.stats][StatsKeys.image_width] hs = sample[Fields.stats][StatsKeys.image_height] if len(ws) <= 0: @@ -172,7 +182,9 @@

Source code for data_juicer.ops.filter.image_shape_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_size_filter.html b/_modules/data_juicer/ops/filter/image_size_filter.html index d4f1d18a5..718972a7a 100644 --- a/_modules/data_juicer/ops/filter/image_size_filter.html +++ b/_modules/data_juicer/ops/filter/image_size_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_size_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_size_filter — data_juicer 1.0.1 documentation - - - + + + @@ -85,7 +85,9 @@

Source code for data_juicer.ops.filter.image_size_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('image_size_filter') +
+[docs] +@OPERATORS.register_module('image_size_filter') class ImageSizeFilter(Filter): """Keep data samples whose image size (in Bytes/KB/MB/...) within a specific range. @@ -93,7 +95,9 @@

Source code for data_juicer.ops.filter.image_size_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', @@ -121,7 +125,10 @@

Source code for data_juicer.ops.filter.image_size_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.image_sizes in sample[Fields.stats]: return sample @@ -139,7 +146,10 @@

Source code for data_juicer.ops.filter.image_size_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): image_sizes = sample[Fields.stats][StatsKeys.image_sizes] keep_bools = np.array([ self.min_size <= image_size <= self.max_size @@ -152,7 +162,9 @@

Source code for data_juicer.ops.filter.image_size_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_text_matching_filter.html b/_modules/data_juicer/ops/filter/image_text_matching_filter.html index bdce95208..1747a884a 100644 --- a/_modules/data_juicer/ops/filter/image_text_matching_filter.html +++ b/_modules/data_juicer/ops/filter/image_text_matching_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_text_matching_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_text_matching_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

OP_NAME = 'image_text_matching_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageTextMatchingFilter(Filter): """Filter to keep samples those matching score between image and text @@ -99,7 +101,9 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, @@ -148,7 +152,10 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

self.horizontal_flip = horizontal_flip self.vertical_flip = vertical_flip
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_text_matching_score in sample[Fields.stats]: return sample @@ -213,7 +220,10 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): itm_scores = sample[Fields.stats][StatsKeys.image_text_matching_score] if len(itm_scores) <= 0: return True @@ -227,7 +237,9 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_text_similarity_filter.html b/_modules/data_juicer/ops/filter/image_text_similarity_filter.html index e0e1dad21..75d19242d 100644 --- a/_modules/data_juicer/ops/filter/image_text_similarity_filter.html +++ b/_modules/data_juicer/ops/filter/image_text_similarity_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_text_similarity_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_text_similarity_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

OP_NAME = 'image_text_similarity_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageTextSimilarityFilter(Filter): """Filter to keep samples those similarities between image and text @@ -100,7 +102,9 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

_accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, @@ -149,7 +153,10 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

self.horizontal_flip = horizontal_flip self.vertical_flip = vertical_flip
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_text_similarity in sample[Fields.stats]: return sample @@ -211,7 +218,10 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): similarity = sample[Fields.stats][StatsKeys.image_text_similarity] if len(similarity) <= 0: return True @@ -225,7 +235,9 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/image_watermark_filter.html b/_modules/data_juicer/ops/filter/image_watermark_filter.html index b9c38ffc0..b16bed077 100644 --- a/_modules/data_juicer/ops/filter/image_watermark_filter.html +++ b/_modules/data_juicer/ops/filter/image_watermark_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.image_watermark_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.image_watermark_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.filter.image_watermark_filter

OP_NAME = 'image_watermark_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageWatermarkFilter(Filter): """ @@ -102,7 +104,9 @@

Source code for data_juicer.ops.filter.image_watermark_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, @@ -135,7 +139,10 @@

Source code for data_juicer.ops.filter.image_watermark_filter

pretrained_model_name_or_path=hf_watermark_model, trust_remote_code=trust_remote_code)
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.image_watermark_prob in sample[Fields.stats]: return sample @@ -165,7 +172,10 @@

Source code for data_juicer.ops.filter.image_watermark_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): itm_probs = sample[Fields.stats][StatsKeys.image_watermark_prob] if len(itm_probs) <= 0: return True @@ -177,7 +187,9 @@

Source code for data_juicer.ops.filter.image_watermark_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/language_id_score_filter.html b/_modules/data_juicer/ops/filter/language_id_score_filter.html index d9d13ac20..1c761fc86 100644 --- a/_modules/data_juicer/ops/filter/language_id_score_filter.html +++ b/_modules/data_juicer/ops/filter/language_id_score_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.language_id_score_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.language_id_score_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,12 +92,16 @@

Source code for data_juicer.ops.filter.language_id_score_filter

OP_NAME = 'language_id_score_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class LanguageIDScoreFilter(Filter): """Filter to keep samples in a specific language with confidence score larger than a specific min value.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: Union[str, List[str]] = '', min_score: float = 0.8, *args, @@ -124,7 +128,10 @@

Source code for data_juicer.ops.filter.language_id_score_filter

self.min_score = min_score self.model_key = prepare_model(model_type='fasttext')
-
[docs] def compute_stats_single(self, sample): + +
+[docs] + def compute_stats_single(self, sample): # check if it's computed already if StatsKeys.lang in sample[ Fields.stats] and StatsKeys.lang_score in sample[Fields.stats]: @@ -145,13 +152,18 @@

Source code for data_juicer.ops.filter.language_id_score_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): if self.lang: return sample[Fields.stats][StatsKeys.lang] in self.lang \ and sample[Fields.stats][StatsKeys.lang_score] >= \ self.min_score else: - return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
+ return sample[Fields.stats][StatsKeys.lang_score] >= self.min_score
+
+
diff --git a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html index 97da27917..14af08160 100644 --- a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html +++ b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.maximum_line_length_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.maximum_line_length_filter — data_juicer 1.0.1 documentation - - - + + + @@ -87,7 +87,9 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

OP_NAME = 'maximum_line_length_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_LINES.register_module(OP_NAME) class MaximumLineLengthFilter(Filter): """Filter to keep samples with maximum line length within a specific @@ -95,7 +97,9 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_len: int = 10, max_len: int = sys.maxsize, *args, @@ -116,7 +120,10 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

self.min_len = min_len self.max_len = max_len
-
[docs] def compute_stats_batched(self, samples, context=False): + +
+[docs] + def compute_stats_batched(self, samples, context=False): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] context_key = f'{InterVars.lines}' @@ -138,7 +145,10 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_len <= stat[StatsKeys.max_line_length] <= @@ -149,7 +159,9 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

StatsKeys.max_line_length] <= self.max_len: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/perplexity_filter.html b/_modules/data_juicer/ops/filter/perplexity_filter.html index 87d5eed4f..f23146c73 100644 --- a/_modules/data_juicer/ops/filter/perplexity_filter.html +++ b/_modules/data_juicer/ops/filter/perplexity_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.perplexity_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.perplexity_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.filter.perplexity_filter

OP_NAME = 'perplexity_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) class PerplexityFilter(Filter): """Filter to keep samples with perplexity score less than a specific max @@ -99,7 +101,9 @@

Source code for data_juicer.ops.filter.perplexity_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', max_ppl: float = 1500, *args, @@ -120,7 +124,10 @@

Source code for data_juicer.ops.filter.perplexity_filter

lang=lang) self.kl_model_key = prepare_model(model_type='kenlm', lang=lang)
-
[docs] def compute_stats_batched(self, samples, context=False): + +
+[docs] + def compute_stats_batched(self, samples, context=False): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] words_key = f'{InterVars.words}-{self.sp_model_key}' @@ -152,12 +159,17 @@

Source code for data_juicer.ops.filter.perplexity_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map(lambda stat: stat[StatsKeys.perplexity] <= self.max_ppl, samples[Fields.stats]) else: - return samples[Fields.stats][StatsKeys.perplexity] <= self.max_ppl
+ return samples[Fields.stats][StatsKeys.perplexity] <= self.max_ppl
+
+
diff --git a/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html b/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html index 165d0f1e9..bae9b1bef 100644 --- a/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html +++ b/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.phrase_grounding_recall_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.phrase_grounding_recall_filter — data_juicer 1.0.1 documentation - - - + + + @@ -139,7 +139,9 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

# NER algorithm adapted from GLIP ends -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class PhraseGroundingRecallFilter(Filter): """Filter to keep samples whose locating recalls of phrases extracted @@ -147,7 +149,9 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, @@ -219,7 +223,10 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

for nltk_data_pkg in requires_nltk_data: nltk.download(nltk_data_pkg)
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.phrase_grounding_recall in sample[Fields.stats]: return sample @@ -333,7 +340,10 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): recalls = sample[Fields.stats][StatsKeys.phrase_grounding_recall] if len(recalls) <= 0: return True @@ -346,7 +356,9 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/special_characters_filter.html b/_modules/data_juicer/ops/filter/special_characters_filter.html index fd9c118b7..eac0fc18d 100644 --- a/_modules/data_juicer/ops/filter/special_characters_filter.html +++ b/_modules/data_juicer/ops/filter/special_characters_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.special_characters_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.special_characters_filter — data_juicer 1.0.1 documentation - - - + + + @@ -87,14 +87,18 @@

Source code for data_juicer.ops.filter.special_characters_filter

from ..common import SPECIAL_CHARACTERS -
[docs]@OPERATORS.register_module('special_characters_filter') +
+[docs] +@OPERATORS.register_module('special_characters_filter') class SpecialCharactersFilter(Filter): """Filter to keep samples with special-char ratio within a specific range.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_ratio: float = 0.0, max_ratio: float = 0.25, *args, @@ -115,7 +119,10 @@

Source code for data_juicer.ops.filter.special_characters_filter

self.min_ratio = min_ratio self.max_ratio = max_ratio
-
[docs] def compute_stats_batched(self, samples): + +
+[docs] + def compute_stats_batched(self, samples): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] @@ -131,7 +138,10 @@

Source code for data_juicer.ops.filter.special_characters_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_ratio <= stat[ @@ -144,7 +154,9 @@

Source code for data_juicer.ops.filter.special_characters_filter

<= self.max_ratio: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/specified_field_filter.html b/_modules/data_juicer/ops/filter/specified_field_filter.html index 15f3074cf..4576f4856 100644 --- a/_modules/data_juicer/ops/filter/specified_field_filter.html +++ b/_modules/data_juicer/ops/filter/specified_field_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.specified_field_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.specified_field_filter — data_juicer 1.0.1 documentation - - - + + + @@ -82,7 +82,9 @@

Source code for data_juicer.ops.filter.specified_field_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('specified_field_filter') +
+[docs] +@OPERATORS.register_module('specified_field_filter') class SpecifiedFieldFilter(Filter): """ Filter based on specified field information. @@ -91,7 +93,9 @@

Source code for data_juicer.ops.filter.specified_field_filter

specified target value, the sample will be filtered. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, field_key: str = '', target_value: List = [], *args, @@ -112,10 +116,16 @@

Source code for data_juicer.ops.filter.specified_field_filter

self.field_key = field_key self.target_value = target_value
-
[docs] def compute_stats_single(self, sample): + +
+[docs] + def compute_stats_single(self, sample): return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): if not (self.field_key and self.target_value): return True @@ -131,7 +141,9 @@

Source code for data_juicer.ops.filter.specified_field_filter

for value in field_value: if value not in self.target_value: return False - return True
+ return True
+
+
diff --git a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html index e21844927..12911c5d2 100644 --- a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html +++ b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.specified_numeric_field_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.specified_numeric_field_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

return False -
[docs]@OPERATORS.register_module('specified_numeric_field_filter') +
+[docs] +@OPERATORS.register_module('specified_numeric_field_filter') class SpecifiedNumericFieldFilter(Filter): """ Filter based on specified numeric field information. @@ -101,7 +103,9 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

specified range, the sample will be filtered. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, field_key: str = '', min_value: float = -sys.maxsize, max_value: float = sys.maxsize, @@ -128,10 +132,16 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

self.min_value = min_value self.max_value = max_value
-
[docs] def compute_stats_single(self, sample): + +
+[docs] + def compute_stats_single(self, sample): return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): if not self.field_key: return True @@ -145,7 +155,9 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter

field_value = float(field_value) return self.min_value <= field_value <= self.max_value else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/stopwords_filter.html b/_modules/data_juicer/ops/filter/stopwords_filter.html index 8f23d0d3c..a1b778294 100644 --- a/_modules/data_juicer/ops/filter/stopwords_filter.html +++ b/_modules/data_juicer/ops/filter/stopwords_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.stopwords_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.stopwords_filter — data_juicer 1.0.1 documentation - - - + + + @@ -97,13 +97,17 @@

Source code for data_juicer.ops.filter.stopwords_filter

OP_NAME = 'stopwords_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) class StopWordsFilter(Filter): """Filter to keep samples with stopword ratio larger than a specific min value.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, @@ -149,7 +153,10 @@

Source code for data_juicer.ops.filter.stopwords_filter

self.model_key = prepare_model(model_type='sentencepiece', lang=lang)
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.stopwords_ratio in sample[Fields.stats]: return sample @@ -196,9 +203,14 @@

Source code for data_juicer.ops.filter.stopwords_filter

sample[Fields.stats][StatsKeys.stopwords_ratio] = stopwords_ratio return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): return sample[Fields.stats][ - StatsKeys.stopwords_ratio] >= self.min_ratio
+ StatsKeys.stopwords_ratio] >= self.min_ratio
+
+
diff --git a/_modules/data_juicer/ops/filter/suffix_filter.html b/_modules/data_juicer/ops/filter/suffix_filter.html index 388d70932..f035e2af4 100644 --- a/_modules/data_juicer/ops/filter/suffix_filter.html +++ b/_modules/data_juicer/ops/filter/suffix_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.suffix_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.suffix_filter — data_juicer 1.0.1 documentation - - - + + + @@ -84,11 +84,15 @@

Source code for data_juicer.ops.filter.suffix_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('suffix_filter') +
+[docs] +@OPERATORS.register_module('suffix_filter') class SuffixFilter(Filter): """Filter to keep samples with specified suffix.""" -
[docs] def __init__(self, suffixes: Union[str, List[str]] = [], *args, **kwargs): +
+[docs] + def __init__(self, suffixes: Union[str, List[str]] = [], *args, **kwargs): """ Initialization method. @@ -105,17 +109,25 @@

Source code for data_juicer.ops.filter.suffix_filter

else: self.suffixes = suffixes
-
[docs] def compute_stats_single(self, sample): + +
+[docs] + def compute_stats_single(self, sample): return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): if self.suffixes: if sample[Fields.suffix] in self.suffixes: return True else: return False else: - return True
+ return True
+
+
diff --git a/_modules/data_juicer/ops/filter/text_action_filter.html b/_modules/data_juicer/ops/filter/text_action_filter.html index 7f28c6686..cfededbf5 100644 --- a/_modules/data_juicer/ops/filter/text_action_filter.html +++ b/_modules/data_juicer/ops/filter/text_action_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.text_action_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.text_action_filter — data_juicer 1.0.1 documentation - - - + + + @@ -87,13 +87,17 @@

Source code for data_juicer.ops.filter.text_action_filter

OP_NAME = 'text_action_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class TextActionFilter(Filter): """ Filter to keep texts those contain actions in the text. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', min_action_num: int = 1, *args, @@ -121,7 +125,10 @@

Source code for data_juicer.ops.filter.text_action_filter

self.action_tags = ['VV', 'VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN'] self.min_action_num = min_action_num
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.num_action in sample[Fields.stats]: return sample @@ -140,12 +147,17 @@

Source code for data_juicer.ops.filter.text_action_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): num_action = sample[Fields.stats][StatsKeys.num_action] if self.min_action_num <= num_action: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html b/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html index 315fbccc1..2d14d4430 100644 --- a/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html +++ b/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.text_entity_dependency_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.text_entity_dependency_filter — data_juicer 1.0.1 documentation - - - + + + @@ -89,14 +89,18 @@

Source code for data_juicer.ops.filter.text_entity_dependency_filter

OP_NAME = 'text_entity_dependency_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class TextEntityDependencyFilter(Filter): """ Identify the entities in the text which are independent with other token, and filter them. The text containing no entities will be omitted. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', @@ -132,7 +136,10 @@

Source code for data_juicer.ops.filter.text_entity_dependency_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.num_dependency_edges in sample[Fields.stats]: return sample @@ -167,7 +174,10 @@

Source code for data_juicer.ops.filter.text_entity_dependency_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): num_dependency_edges = sample[Fields.stats][ StatsKeys.num_dependency_edges] keep_bools = np.array([ @@ -182,7 +192,9 @@

Source code for data_juicer.ops.filter.text_entity_dependency_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/text_length_filter.html b/_modules/data_juicer/ops/filter/text_length_filter.html index 9eb8a2b8f..a1d6dddcb 100644 --- a/_modules/data_juicer/ops/filter/text_length_filter.html +++ b/_modules/data_juicer/ops/filter/text_length_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.text_length_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.text_length_filter — data_juicer 1.0.1 documentation - - - + + + @@ -84,14 +84,18 @@

Source code for data_juicer.ops.filter.text_length_filter

from ..base_op import OPERATORS, Filter -
[docs]@OPERATORS.register_module('text_length_filter') +
+[docs] +@OPERATORS.register_module('text_length_filter') class TextLengthFilter(Filter): """Filter to keep samples with total text length within a specific range.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_len: int = 10, max_len: int = sys.maxsize, *args, @@ -112,7 +116,10 @@

Source code for data_juicer.ops.filter.text_length_filter

self.min_len = min_len self.max_len = max_len
-
[docs] def compute_stats_batched(self, samples): + +
+[docs] + def compute_stats_batched(self, samples): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] for i, stat in enumerate(samples_stats): @@ -124,7 +131,10 @@

Source code for data_juicer.ops.filter.text_length_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_len <= stat[StatsKeys.text_len] <= self. @@ -135,7 +145,9 @@

Source code for data_juicer.ops.filter.text_length_filter

StatsKeys.text_len] <= self.max_len: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/token_num_filter.html b/_modules/data_juicer/ops/filter/token_num_filter.html index 5b2b98f4f..c314c4596 100644 --- a/_modules/data_juicer/ops/filter/token_num_filter.html +++ b/_modules/data_juicer/ops/filter/token_num_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.token_num_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.token_num_filter — data_juicer 1.0.1 documentation - - - + + + @@ -88,12 +88,16 @@

Source code for data_juicer.ops.filter.token_num_filter

OP_NAME = 'token_num_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class TokenNumFilter(Filter): """Filter to keep samples with total token number within a specific range.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = sys.maxsize, @@ -121,7 +125,10 @@

Source code for data_juicer.ops.filter.token_num_filter

pretrained_model_name_or_path=hf_tokenizer, return_model=False)
-
[docs] def compute_stats_single(self, sample): + +
+[docs] + def compute_stats_single(self, sample): # check if it's computed already if StatsKeys.num_token in sample[Fields.stats]: return sample @@ -133,12 +140,17 @@

Source code for data_juicer.ops.filter.token_num_filter

sample[Fields.stats][StatsKeys.num_token] = len(tokens) return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): if self.min_num <= sample[Fields.stats][ StatsKeys.num_token] <= self.max_num: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html index f14962d90..434b4056c 100644 --- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html +++ b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_aesthetics_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_aesthetics_filter — data_juicer 1.0.1 documentation - - - + + + @@ -96,7 +96,9 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

OP_NAME = 'video_aesthetics_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) class VideoAestheticsFilter(Filter): @@ -106,7 +108,9 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, @@ -189,7 +193,10 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

('' if frame_sampling_method == 'all_keyframes' else f'-{frame_num}')
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: return sample @@ -264,7 +271,10 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): aesthetics_scores = ( sample)[Fields.stats][StatsKeys.video_frames_aesthetics_score] if len(aesthetics_scores) <= 0: @@ -279,7 +289,9 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html index 22234f24b..3f4468901 100644 --- a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_aspect_ratio_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_aspect_ratio_filter — data_juicer 1.0.1 documentation - - - + + + @@ -89,14 +89,18 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

from ..op_fusion import LOADED_VIDEOS -
[docs]@OPERATORS.register_module('video_aspect_ratio_filter') +
+[docs] +@OPERATORS.register_module('video_aspect_ratio_filter') @LOADED_VIDEOS.register_module('video_aspect_ratio_filter') class VideoAspectRatioFilter(Filter): """Filter to keep samples with video aspect ratio within a specific range. AspectRatio = W / H. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', @@ -124,7 +128,10 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.video_aspect_ratios in sample[Fields.stats]: return sample @@ -155,7 +162,10 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): video_aspect_ratios = sample[Fields.stats][ StatsKeys.video_aspect_ratios] @@ -170,7 +180,9 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html index 0d83acc8c..3cdb9e874 100644 --- a/_modules/data_juicer/ops/filter/video_duration_filter.html +++ b/_modules/data_juicer/ops/filter/video_duration_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_duration_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_duration_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,13 +91,17 @@

Source code for data_juicer.ops.filter.video_duration_filter

OP_NAME = 'video_duration_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoDurationFilter(Filter): """Keep data samples whose videos' durations are within a specified range. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_duration: float = 0, max_duration: float = sys.maxsize, any_or_all: str = 'any', @@ -125,7 +129,10 @@

Source code for data_juicer.ops.filter.video_duration_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.video_duration in sample[Fields.stats]: return sample @@ -156,7 +163,10 @@

Source code for data_juicer.ops.filter.video_duration_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): video_durations = sample[Fields.stats][StatsKeys.video_duration] keep_bools = np.array([ self.min_duration <= duration <= self.max_duration @@ -169,7 +179,9 @@

Source code for data_juicer.ops.filter.video_duration_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html index 6bed4034f..f20a30921 100644 --- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html +++ b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_frames_text_similarity_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_frames_text_similarity_filter — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filterOP_NAME = 'video_frames_text_similarity_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) class VideoFramesTextSimilarityFilter(Filter): @@ -104,7 +106,9 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filter_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, @@ -181,7 +185,10 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filter('' if frame_sampling_method == 'all_keyframes' else f'-{frame_num}')

-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_frames_text_similarity in sample[Fields.stats]: return sample @@ -275,7 +282,10 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filterreturn sample

-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): similarity = sample[Fields.stats][ StatsKeys.video_frames_text_similarity] if len(similarity) <= 0: @@ -290,7 +300,9 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filterif self.any: return keep_bools.any() else: - return keep_bools.all()

+ return keep_bools.all()
+
+

diff --git a/_modules/data_juicer/ops/filter/video_motion_score_filter.html b/_modules/data_juicer/ops/filter/video_motion_score_filter.html index cc473845a..8fcce9e73 100644 --- a/_modules/data_juicer/ops/filter/video_motion_score_filter.html +++ b/_modules/data_juicer/ops/filter/video_motion_score_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_motion_score_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_motion_score_filter — data_juicer 1.0.1 documentation - - - + + + @@ -104,7 +104,9 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

cap.release() -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class VideoMotionScoreFilter(Filter): """Filter to keep samples with video motion scores within a specific range. The @@ -121,7 +123,9 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

'flags': 0 } -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_score: float = 0.25, max_score: float = sys.float_info.max, sampling_fps: PositiveFloat = 2, @@ -187,10 +191,16 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def setup_model(self, rank=None): + +
+[docs] + def setup_model(self, rank=None): self.model = cv2.calcOpticalFlowFarneback
-
[docs] def compute_flow(self, prev_frame, curr_frame): + +
+[docs] + def compute_flow(self, prev_frame, curr_frame): curr_frame = cv2.cvtColor(curr_frame, cv2.COLOR_BGR2GRAY) if prev_frame is None: flow = None @@ -199,7 +209,10 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

**self.extra_kwargs) return flow, curr_frame
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): self.rank = rank # check if it's computed already @@ -279,7 +292,10 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

] return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): video_motion_scores = sample[Fields.stats][ StatsKeys.video_motion_score] @@ -294,7 +310,9 @@

Source code for data_juicer.ops.filter.video_motion_score_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_motion_score_raft_filter.html b/_modules/data_juicer/ops/filter/video_motion_score_raft_filter.html index 76cc5499a..7920abdb6 100644 --- a/_modules/data_juicer/ops/filter/video_motion_score_raft_filter.html +++ b/_modules/data_juicer/ops/filter/video_motion_score_raft_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_motion_score_raft_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_motion_score_raft_filter — data_juicer 1.0.1 documentation - - - + + + @@ -96,7 +96,9 @@

Source code for data_juicer.ops.filter.video_motion_score_raft_filter

OP_NAME = 'video_motion_score_raft_filter' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class VideoMotionScoreRaftFilter(VideoMotionScoreFilter): """Filter to keep samples with video motion scores within a specified range. @@ -113,7 +115,9 @@

Source code for data_juicer.ops.filter.video_motion_score_raft_filter

_accelerator = 'cuda' _default_kwargs = {} -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_score: float = 1.0, max_score: float = sys.float_info.max, sampling_fps: PositiveFloat = 2, @@ -128,7 +132,10 @@

Source code for data_juicer.ops.filter.video_motion_score_raft_filter

super().__init__(min_score, max_score, sampling_fps, size, max_size, divisible, relative, any_or_all, *args, **kwargs)
-
[docs] def setup_model(self, rank=None): + +
+[docs] + def setup_model(self, rank=None): self.model = tvm.optical_flow.raft_large( weights=tvm.optical_flow.Raft_Large_Weights.DEFAULT, progress=False) @@ -147,7 +154,10 @@

Source code for data_juicer.ops.filter.video_motion_score_raft_filter

tvt.Lambda(lambda img: img.flip(-3).unsqueeze(0)), # BGR to RGB ])
-
[docs] def compute_flow(self, prev_frame, curr_frame): + +
+[docs] + def compute_flow(self, prev_frame, curr_frame): curr_frame = self.transforms(curr_frame).to(self.device) if prev_frame is None: flow = None @@ -156,7 +166,9 @@

Source code for data_juicer.ops.filter.video_motion_score_raft_filter

flows = self.model(prev_frame, curr_frame) flow = flows[-1][0].cpu().numpy().transpose( (1, 2, 0)) # 2, H, W -> H, W, 2 - return flow, curr_frame
+ return flow, curr_frame
+
+
diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html index 741f65a73..31abd5a4a 100644 --- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html +++ b/_modules/data_juicer/ops/filter/video_nsfw_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_nsfw_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_nsfw_filter — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

OP_NAME = 'video_nsfw_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) class VideoNSFWFilter(Filter): @@ -103,7 +105,9 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, @@ -170,7 +174,10 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

('' if frame_sampling_method == 'all_keyframes' else f'-{frame_num}')
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_nsfw_score in sample[Fields.stats]: return sample @@ -240,7 +247,10 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): itm_scores = sample[Fields.stats][StatsKeys.video_nsfw_score] if len(itm_scores) <= 0: return True @@ -252,7 +262,9 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html index a67e84e5f..38bb37f89 100644 --- a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_ocr_area_ratio_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_ocr_area_ratio_filter — data_juicer 1.0.1 documentation - - - + + + @@ -109,7 +109,9 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

return tri_area -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) @@ -120,7 +122,9 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: PositiveInt = 3, @@ -171,7 +175,10 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

# only uniformly sampling method is supported in this OP self.sampled_frames_key_suffix = f'-uniform-{frame_sample_num}'
-
[docs] def get_reader(self, rank): + +
+[docs] + def get_reader(self, rank): if self.use_cuda(): rank = 0 if rank is None else rank device = f'cuda:{rank % cuda_device_count()}' @@ -179,7 +186,10 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

self.reader.device = device return self.reader
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_ocr_area_ratio in sample[Fields.stats]: return sample @@ -260,7 +270,10 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): video_ocr_area_ratios = sample[Fields.stats][ StatsKeys.video_ocr_area_ratio] keep_bools = np.array([ @@ -274,7 +287,9 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_resolution_filter.html b/_modules/data_juicer/ops/filter/video_resolution_filter.html index 95eea9560..acc5fbe28 100644 --- a/_modules/data_juicer/ops/filter/video_resolution_filter.html +++ b/_modules/data_juicer/ops/filter/video_resolution_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_resolution_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_resolution_filter — data_juicer 1.0.1 documentation - - - + + + @@ -91,13 +91,17 @@

Source code for data_juicer.ops.filter.video_resolution_filter

OP_NAME = 'video_resolution_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoResolutionFilter(Filter): """Keep data samples whose videos' resolutions are within a specified range. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_width: int = 1, max_width: int = sys.maxsize, min_height: int = 1, @@ -129,7 +133,10 @@

Source code for data_juicer.ops.filter.video_resolution_filter

f'Can only be one of ["any", "all"].') self.any = (any_or_all == 'any')
-
[docs] def compute_stats_single(self, sample, context=False): + +
+[docs] + def compute_stats_single(self, sample, context=False): # check if it's computed already if StatsKeys.video_width in sample[Fields.stats] \ and StatsKeys.video_height in sample[Fields.stats]: @@ -174,7 +181,10 @@

Source code for data_juicer.ops.filter.video_resolution_filter

return sample
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): ws = sample[Fields.stats][StatsKeys.video_width] hs = sample[Fields.stats][StatsKeys.video_height] keep_bools = np.array([ @@ -189,7 +199,9 @@

Source code for data_juicer.ops.filter.video_resolution_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html b/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html index 9b881a2e6..9ed751f48 100644 --- a/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html +++ b/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_tagging_from_frames_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_tagging_from_frames_filter — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.filter.video_tagging_from_frames_filter

OP_NAME = 'video_tagging_from_frames_filter' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoTaggingFromFramesFilter(Filter): @@ -101,7 +103,9 @@

Source code for data_juicer.ops.filter.video_tagging_from_frames_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', @@ -162,13 +166,19 @@

Source code for data_juicer.ops.filter.video_tagging_from_frames_filter

tag_field_name=self.tag_field_name, )
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): sample = self.tagging_producer.process(sample, rank, context) return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): video_tags = sample[self.tag_field_name] if len(video_tags) <= 0: return True @@ -186,7 +196,9 @@

Source code for data_juicer.ops.filter.video_tagging_from_frames_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html index ab6213f85..4ed59c3eb 100644 --- a/_modules/data_juicer/ops/filter/video_watermark_filter.html +++ b/_modules/data_juicer/ops/filter/video_watermark_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.video_watermark_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.video_watermark_filter — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.ops.filter.video_watermark_filter

OP_NAME = 'video_watermark_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @INTER_SAMPLED_FRAMES.register_module(OP_NAME) class VideoWatermarkFilter(Filter): @@ -106,7 +108,9 @@

Source code for data_juicer.ops.filter.video_watermark_filter

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, @@ -174,7 +178,10 @@

Source code for data_juicer.ops.filter.video_watermark_filter

('' if frame_sampling_method == 'all_keyframes' else f'-{frame_num}')
-
[docs] def compute_stats_single(self, sample, rank=None, context=False): + +
+[docs] + def compute_stats_single(self, sample, rank=None, context=False): # check if it's computed already if StatsKeys.video_watermark_prob in sample[Fields.stats]: return sample @@ -242,7 +249,10 @@

Source code for data_juicer.ops.filter.video_watermark_filter

return sample
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): itm_probs = sample[Fields.stats][StatsKeys.video_watermark_prob] if len(itm_probs) <= 0: return True @@ -254,7 +264,9 @@

Source code for data_juicer.ops.filter.video_watermark_filter

if self.any: return keep_bools.any() else: - return keep_bools.all()
+ return keep_bools.all()
+
+
diff --git a/_modules/data_juicer/ops/filter/word_repetition_filter.html b/_modules/data_juicer/ops/filter/word_repetition_filter.html index 64a06e6d5..5cbd99f5d 100644 --- a/_modules/data_juicer/ops/filter/word_repetition_filter.html +++ b/_modules/data_juicer/ops/filter/word_repetition_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.word_repetition_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.word_repetition_filter — data_juicer 1.0.1 documentation - - - + + + @@ -94,7 +94,9 @@

Source code for data_juicer.ops.filter.word_repetition_filter

OP_NAME = 'word_repetition_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) class WordRepetitionFilter(Filter): """Filter to keep samples with word-level n-gram repetition ratio within a @@ -102,7 +104,9 @@

Source code for data_juicer.ops.filter.word_repetition_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', tokenization: bool = False, rep_len: PositiveInt = 10, @@ -136,7 +140,10 @@

Source code for data_juicer.ops.filter.word_repetition_filter

self.model_key = prepare_model(model_type='sentencepiece', lang=lang)
-
[docs] def compute_stats_batched(self, samples, context=False): + +
+[docs] + def compute_stats_batched(self, samples, context=False): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] words_key = f'{InterVars.words}-{self.model_key}' @@ -189,7 +196,10 @@

Source code for data_juicer.ops.filter.word_repetition_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_ratio <= stat[StatsKeys.word_rep_ratio] @@ -200,7 +210,9 @@

Source code for data_juicer.ops.filter.word_repetition_filter

StatsKeys.word_rep_ratio] <= self.max_ratio: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/filter/words_num_filter.html b/_modules/data_juicer/ops/filter/words_num_filter.html index 46f3261b4..547f553d3 100644 --- a/_modules/data_juicer/ops/filter/words_num_filter.html +++ b/_modules/data_juicer/ops/filter/words_num_filter.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.filter.words_num_filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter.words_num_filter — data_juicer 1.0.1 documentation - - - + + + @@ -90,7 +90,9 @@

Source code for data_juicer.ops.filter.words_num_filter

OP_NAME = 'words_num_filter' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) class WordsNumFilter(Filter): """Filter to keep samples with total words number within a specific @@ -98,7 +100,9 @@

Source code for data_juicer.ops.filter.words_num_filter

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', tokenization: bool = False, min_num: int = 10, @@ -129,7 +133,10 @@

Source code for data_juicer.ops.filter.words_num_filter

self.model_key = prepare_model(model_type='sentencepiece', lang=lang)
-
[docs] def compute_stats_batched(self, samples, context=False): + +
+[docs] + def compute_stats_batched(self, samples, context=False): samples_list = samples[self.text_key] samples_stats = samples[Fields.stats] words_key = f'{InterVars.words}-{self.model_key}' @@ -153,7 +160,10 @@

Source code for data_juicer.ops.filter.words_num_filter

return samples
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if isinstance(samples[Fields.stats], list): return map( lambda stat: self.min_num <= stat[StatsKeys.num_words] <= self. @@ -164,7 +174,9 @@

Source code for data_juicer.ops.filter.words_num_filter

StatsKeys.num_words] <= self.max_num: return True else: - return False
+ return False
+
+
diff --git a/_modules/data_juicer/ops/load.html b/_modules/data_juicer/ops/load.html index 9fab51fb1..0ceadf7ee 100644 --- a/_modules/data_juicer/ops/load.html +++ b/_modules/data_juicer/ops/load.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.load — data_juicer 1.0.0 documentation + data_juicer.ops.load — data_juicer 1.0.1 documentation - - - + + + @@ -80,7 +80,9 @@

Source code for data_juicer.ops.load

 from .base_op import OPERATORS
 
 
-
[docs]def load_ops(process_list): +
+[docs] +def load_ops(process_list): """ Load op list according to the process list from config file. @@ -100,6 +102,7 @@

Source code for data_juicer.ops.load

         op._op_cfg = op_cfg
 
     return ops
+
diff --git a/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html b/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html index f85e79d85..9e1dc84f2 100644 --- a/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html +++ b/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,12 +92,16 @@

Source code for data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper

OP_NAME = 'audio_ffmpeg_wrapped_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class AudioFFmpegWrappedMapper(Mapper): """Simple wrapper for FFmpeg audio filters. """ -
[docs] def __init__( +
+[docs] + def __init__( self, filter_name: Optional[str] = None, filter_kwargs: Optional[Dict] = None, @@ -127,7 +131,10 @@

Source code for data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper

self.capture_stderr = capture_stderr self.overwrite_output = overwrite_output
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): # there is no audio in this sample if self.audio_key not in sample or not sample[self.audio_key]: sample[Fields.source_file] = [] @@ -162,7 +169,9 @@

Source code for data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper

sample[Fields.source_file][i] = value sample[self.audio_key] = [processed[key] for key in loaded_audio_keys] - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html index 5f37322da..814744ced 100644 --- a/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html +++ b/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.calibrate_qa_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.calibrate_qa_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -90,7 +90,9 @@

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class CalibrateQAMapper(Mapper): """ Mapper to calibrate question-answer pairs based on reference text. @@ -108,7 +110,9 @@

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)' -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', *, api_endpoint: Optional[str] = None, @@ -159,7 +163,10 @@

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

self.try_num = try_num
-
[docs] def build_input(self, sample): + +
+[docs] + def build_input(self, sample): reference = self.reference_template.format(sample[self.text_key]) qa_pair = self.qa_pair_template.format(sample[self.query_key], sample[self.response_key]) @@ -167,14 +174,20 @@

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

qa_pair=qa_pair) return input_prompt
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): match = re.match(self.output_pattern, raw_output) if match: return match.group(1).strip(), match.group(2).strip() else: return None, None
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): client = get_model(self.model_key, rank=rank) messages = [{ @@ -198,7 +211,9 @@

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

if parsed_a: sample[self.response_key] = parsed_a - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html index c365534a9..446613f80 100644 --- a/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html +++ b/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.calibrate_query_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.calibrate_query_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.calibrate_query_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class CalibrateQueryMapper(CalibrateQAMapper): """ Mapper to calibrate query in question-answer pairs based on reference text. @@ -93,8 +95,12 @@

Source code for data_juicer.ops.mapper.calibrate_query_mapper

DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,\ 使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。' -
[docs] def parse_output(self, raw_output): - return raw_output.strip(), None
+
+[docs] + def parse_output(self, raw_output): + return raw_output.strip(), None
+
+
diff --git a/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html index 45fb5ca59..f555e9116 100644 --- a/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html +++ b/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.calibrate_response_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.calibrate_response_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.calibrate_response_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class CalibrateResponseMapper(CalibrateQAMapper): """ Mapper to calibrate response in question-answer pairs based on reference text. @@ -93,8 +95,12 @@

Source code for data_juicer.ops.mapper.calibrate_response_mapper

DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,\ 使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。' -
[docs] def parse_output(self, raw_output): - return None, raw_output.strip()
+
+[docs] + def parse_output(self, raw_output): + return None, raw_output.strip()
+
+
diff --git a/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html b/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html index 689d6af55..b97137b62 100644 --- a/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html +++ b/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.chinese_convert_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.chinese_convert_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -100,14 +100,18 @@

Source code for data_juicer.ops.mapper.chinese_convert_mapper

OPENCC_CONVERTER = opencc.OpenCC(mode_path) -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ChineseConvertMapper(Mapper): """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.""" _batched_op = True -
[docs] def __init__(self, mode: str = 's2t', *args, **kwargs): +
+[docs] + def __init__(self, mode: str = 's2t', *args, **kwargs): """ Initialization method. @@ -162,13 +166,18 @@

Source code for data_juicer.ops.mapper.chinese_convert_mapper

self.mode = mode prepare_converter(self.mode)
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): prepare_converter(self.mode) samples[self.text_key] = [ OPENCC_CONVERTER.convert(text) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html index bca25e3c7..4ec423c9b 100644 --- a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html +++ b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.clean_copyright_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.clean_copyright_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -86,14 +86,18 @@

Source code for data_juicer.ops.mapper.clean_copyright_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('clean_copyright_mapper') +
+[docs] +@OPERATORS.register_module('clean_copyright_mapper') class CleanCopyrightMapper(Mapper): """Mapper to clean copyright comments at the beginning of the text samples.""" _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -104,6 +108,7 @@

Source code for data_juicer.ops.mapper.clean_copyright_mapper

self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/') self.cpat = re.compile('copyright', re.IGNORECASE)
+ def _process_single_sample(self, sample): r = self.pat.search(sample) if r: @@ -133,12 +138,16 @@

Source code for data_juicer.ops.mapper.clean_copyright_mapper

sample = '\n'.join(lines[skip:]) return sample -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): samples[self.text_key] = [ self._process_single_sample(text) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/clean_email_mapper.html b/_modules/data_juicer/ops/mapper/clean_email_mapper.html index f2fe04dec..9535287a8 100644 --- a/_modules/data_juicer/ops/mapper/clean_email_mapper.html +++ b/_modules/data_juicer/ops/mapper/clean_email_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.clean_email_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.clean_email_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,13 +84,17 @@

Source code for data_juicer.ops.mapper.clean_email_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('clean_email_mapper') +
+[docs] +@OPERATORS.register_module('clean_email_mapper') class CleanEmailMapper(Mapper): """Mapper to clean email in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, pattern: Optional[str] = None, repl: str = '', *args, @@ -115,7 +119,10 @@

Source code for data_juicer.ops.mapper.clean_email_mapper

self.repl = repl
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if not re.search(self.pattern, text, flags=re.DOTALL): continue @@ -124,7 +131,9 @@

Source code for data_juicer.ops.mapper.clean_email_mapper

string=text, flags=re.DOTALL) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/clean_html_mapper.html b/_modules/data_juicer/ops/mapper/clean_html_mapper.html index a6ec200c8..8615d2bad 100644 --- a/_modules/data_juicer/ops/mapper/clean_html_mapper.html +++ b/_modules/data_juicer/ops/mapper/clean_html_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.clean_html_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.clean_html_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -90,13 +90,17 @@

Source code for data_juicer.ops.mapper.clean_html_mapper

OP_NAME = 'clean_html_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class CleanHtmlMapper(Mapper): """Mapper to clean html code in text samples.""" _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -105,7 +109,10 @@

Source code for data_juicer.ops.mapper.clean_html_mapper

""" super().__init__(*args, **kwargs)
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): def _clean_html(raw_html): raw_html = raw_html.replace('<li>', '\n*') @@ -118,7 +125,9 @@

Source code for data_juicer.ops.mapper.clean_html_mapper

samples[self.text_key] = [ _clean_html(text) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html index 19a54370f..edb03a1b0 100644 --- a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html +++ b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.clean_ip_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.clean_ip_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,13 +84,17 @@

Source code for data_juicer.ops.mapper.clean_ip_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('clean_ip_mapper') +
+[docs] +@OPERATORS.register_module('clean_ip_mapper') class CleanIpMapper(Mapper): """Mapper to clean ipv4 and ipv6 address in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, pattern: Optional[str] = None, repl: str = '', *args, @@ -119,7 +123,10 @@

Source code for data_juicer.ops.mapper.clean_ip_mapper

self.pattern = pattern[2:-1] self.repl = repl
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if not re.search(self.pattern, text, flags=re.DOTALL): continue @@ -127,7 +134,9 @@

Source code for data_juicer.ops.mapper.clean_ip_mapper

repl=self.repl, string=text, flags=re.DOTALL) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/clean_links_mapper.html b/_modules/data_juicer/ops/mapper/clean_links_mapper.html index c856dfcd9..6135409ca 100644 --- a/_modules/data_juicer/ops/mapper/clean_links_mapper.html +++ b/_modules/data_juicer/ops/mapper/clean_links_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.clean_links_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.clean_links_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -87,13 +87,17 @@

Source code for data_juicer.ops.mapper.clean_links_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('clean_links_mapper') +
+[docs] +@OPERATORS.register_module('clean_links_mapper') class CleanLinksMapper(Mapper): """Mapper to clean links like http/https/ftp in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, pattern: Optional[str] = None, repl: str = '', *args, @@ -125,7 +129,10 @@

Source code for data_juicer.ops.mapper.clean_links_mapper

self.pattern = pattern[2:-1] self.repl = repl
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if not re.search(self.pattern, text, flags=re.DOTALL): continue @@ -134,7 +141,9 @@

Source code for data_juicer.ops.mapper.clean_links_mapper

repl=self.repl, string=text, flags=re.DOTALL) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html index a1e2dc842..e64b19bcb 100644 --- a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html +++ b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.expand_macro_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.expand_macro_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -86,14 +86,18 @@

Source code for data_juicer.ops.mapper.expand_macro_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('expand_macro_mapper') +
+[docs] +@OPERATORS.register_module('expand_macro_mapper') class ExpandMacroMapper(Mapper): """Mapper to expand macro definitions in the document body of Latex samples.""" _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -102,6 +106,7 @@

Source code for data_juicer.ops.mapper.expand_macro_mapper

""" super().__init__(*args, **kwargs)
+ def _build_non_arg_macros_dict(self, file_content): # regex for extracting \newcommand macros without arguments non_arg_nc_reg = re.compile( @@ -136,7 +141,9 @@

Source code for data_juicer.ops.mapper.expand_macro_mapper

macros[macro_name] = macro_val return macros -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): non_arg_macros = self._build_non_arg_macros_dict(text) @@ -161,7 +168,9 @@

Source code for data_juicer.ops.mapper.expand_macro_mapper

samples[self.text_key][idx] = text - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/extract_entity_attribute_mapper.html b/_modules/data_juicer/ops/mapper/extract_entity_attribute_mapper.html index e29406b03..0e519aae6 100644 --- a/_modules/data_juicer/ops/mapper/extract_entity_attribute_mapper.html +++ b/_modules/data_juicer/ops/mapper/extract_entity_attribute_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.extract_entity_attribute_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.extract_entity_attribute_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< # TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ExtractEntityAttributeMapper(Mapper): """ Extract attributes for given entities from the text @@ -117,7 +119,9 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< DEFAULT_ATTR_PATTERN_TEMPLATE = r'\#\#\s*{attribute}:\s*(.*?)(?=\#\#\#|\Z)' DEFAULT_DEMON_PATTERN = r'\#\#\#\s*代表性示例(\d+):\s*(.*?)(?=\#\#\#|\Z)' -
[docs] def __init__(self, +
+[docs] + def __init__(self, query_entities: List[str] = [], query_attributes: List[str] = [], api_model: str = 'gpt-4o', @@ -197,7 +201,10 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< self.try_num = try_num self.drop_text = drop_text
-
[docs] def parse_output(self, raw_output, attribute_name): + +
+[docs] + def parse_output(self, raw_output, attribute_name): attribute_pattern = self.attr_pattern_template.format( attribute=attribute_name) @@ -214,6 +221,7 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< return attribute, demos
+ def _process_single_sample(self, text='', rank=None): client = get_model(self.model_key, rank=rank) @@ -247,7 +255,9 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< return entities, attributes, descs, demo_lists -
[docs] def process_batched(self, samples, rank=None): +
+[docs] + def process_batched(self, samples, rank=None): sample_num = len(samples[self.text_key]) @@ -274,7 +284,9 @@

Source code for data_juicer.ops.mapper.extract_entity_attribute_mapper

< for key in samples: samples[key] = list(chain(*samples[key])) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/extract_entity_relation_mapper.html b/_modules/data_juicer/ops/mapper/extract_entity_relation_mapper.html index 8a85acb69..0101af7db 100644 --- a/_modules/data_juicer/ops/mapper/extract_entity_relation_mapper.html +++ b/_modules/data_juicer/ops/mapper/extract_entity_relation_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.extract_entity_relation_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.extract_entity_relation_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -99,7 +99,9 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ExtractEntityRelationMapper(Mapper): """ Extract entities and relations in the text for knowledge graph. @@ -224,7 +226,9 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

DEFAULT_ENTITY_PATTERN = r'\("entity"(.*?)\)' DEFAULT_RELATION_PATTERN = r'\("relationship"(.*?)\)' -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', entity_types: List[str] = None, *, @@ -307,7 +311,10 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

self.try_num = try_num self.drop_text = drop_text
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): entities, relations = [], [] def remove_outer_quotes(text): @@ -359,10 +366,16 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

return entities, relations
-
[docs] def add_message(self, messages, role, content): + +
+[docs] + def add_message(self, messages, role, content): return messages + [{'role': role, 'content': content}]
-
[docs] def light_rag_extraction(self, messages, rank=None): + +
+[docs] + def light_rag_extraction(self, messages, rank=None): client = get_model(self.model_key, rank=rank) final_result = client(messages, **self.sampling_params) @@ -386,7 +399,10 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

return final_result
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): input_prompt = self.prompt_template.format( tuple_delimiter=self.tuple_delimiter, @@ -408,7 +424,9 @@

Source code for data_juicer.ops.mapper.extract_entity_relation_mapper

sample[self.entity_key] = entities sample[self.relation_key] = relations - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/extract_event_mapper.html b/_modules/data_juicer/ops/mapper/extract_event_mapper.html index 77ebb49b1..03394919a 100644 --- a/_modules/data_juicer/ops/mapper/extract_event_mapper.html +++ b/_modules/data_juicer/ops/mapper/extract_event_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.extract_event_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.extract_event_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -94,7 +94,9 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ExtractEventMapper(Mapper): """ Extract events and relevant characters in the text @@ -128,7 +130,9 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

-\s*\*\*相关人物\*\*\s*:\s*(.*?)(?=\#\#\#|\Z) """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', *, event_desc_key: str = Fields.event_description, @@ -184,7 +188,10 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

self.try_num = try_num self.drop_text = drop_text
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): pattern = re.compile(self.output_pattern, re.VERBOSE | re.DOTALL) matches = pattern.findall(raw_output) @@ -199,6 +206,7 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

return event_list, character_list
+ def _process_single_sample(self, text='', rank=None): client = get_model(self.model_key, rank=rank) @@ -223,7 +231,9 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

return event_list, character_list -
[docs] def process_batched(self, samples, rank=None): +
+[docs] + def process_batched(self, samples, rank=None): sample_num = len(samples[self.text_key]) @@ -246,7 +256,9 @@

Source code for data_juicer.ops.mapper.extract_event_mapper

for key in samples: samples[key] = list(chain(*samples[key])) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/extract_keyword_mapper.html b/_modules/data_juicer/ops/mapper/extract_keyword_mapper.html index aae41116d..b7f5c1e1a 100644 --- a/_modules/data_juicer/ops/mapper/extract_keyword_mapper.html +++ b/_modules/data_juicer/ops/mapper/extract_keyword_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.extract_keyword_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.extract_keyword_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.ops.mapper.extract_keyword_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ExtractKeywordMapper(Mapper): """ Generate keywords for the text @@ -178,7 +180,9 @@

Source code for data_juicer.ops.mapper.extract_keyword_mapper

DEFAULT_COMPLETION_DELIMITER = '<|COMPLETE|>' DEFAULT_OUTPUT_PATTERN = r'\("content_keywords"(.*?)\)' -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', *, keyword_key: str = Fields.keyword, @@ -230,7 +234,10 @@

Source code for data_juicer.ops.mapper.extract_keyword_mapper

self.try_num = try_num self.drop_text = drop_text
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): keywords = [] output_pattern = re.compile(self.output_pattern, @@ -242,7 +249,10 @@

Source code for data_juicer.ops.mapper.extract_keyword_mapper

return keywords
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): client = get_model(self.model_key, rank=rank) input_prompt = self.prompt_template.format( @@ -264,7 +274,9 @@

Source code for data_juicer.ops.mapper.extract_keyword_mapper

if self.drop_text: sample.pop(self.text_key) - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/extract_nickname_mapper.html b/_modules/data_juicer/ops/mapper/extract_nickname_mapper.html index b8f9cfea0..a6d8f847e 100644 --- a/_modules/data_juicer/ops/mapper/extract_nickname_mapper.html +++ b/_modules/data_juicer/ops/mapper/extract_nickname_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.extract_nickname_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.extract_nickname_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.mapper.extract_nickname_mapper

# TODO: LLM-based inference. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class ExtractNicknameMapper(Mapper): """ Extract nickname relationship in the text. @@ -126,7 +128,9 @@

Source code for data_juicer.ops.mapper.extract_nickname_mapper

-\s*\*\*(.*?)对(.*?)的昵称\*\*\s*:\s*(.*?)(?=\#\#\#|\Z) # for double check """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', *, nickname_key: str = Fields.nickname, @@ -177,7 +181,10 @@

Source code for data_juicer.ops.mapper.extract_nickname_mapper

self.try_num = try_num self.drop_text = drop_text
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): pattern = re.compile(self.output_pattern, re.VERBOSE | re.DOTALL) matches = pattern.findall(raw_output) @@ -209,7 +216,10 @@

Source code for data_juicer.ops.mapper.extract_nickname_mapper

return nickname_relations
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): client = get_model(self.model_key, rank=rank) input_prompt = self.input_template.format(text=sample[self.text_key]) @@ -234,7 +244,9 @@

Source code for data_juicer.ops.mapper.extract_nickname_mapper

if self.drop_text: sample.pop(self.text_key) - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html index 3dd9656fd..2d606a405 100644 --- a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html +++ b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.fix_unicode_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.fix_unicode_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -86,13 +86,17 @@

Source code for data_juicer.ops.mapper.fix_unicode_mapper

OP_NAME = 'fix_unicode_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class FixUnicodeMapper(Mapper): """Mapper to fix unicode errors in text samples.""" _batched_op = True -
[docs] def __init__(self, normalization: str = None, *args, **kwargs): +
+[docs] + def __init__(self, normalization: str = None, *args, **kwargs): """ Initialization method. @@ -113,12 +117,17 @@

Source code for data_juicer.ops.mapper.fix_unicode_mapper

'supported. Can only be one of ' '["NFC", "NFKC", "NFD", "NFKD"]')
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): samples[self.text_key] = [ ftfy.fix_text(text, normalization=self.normalization) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html index c90126a3d..421529a4d 100644 --- a/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html +++ b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -98,7 +98,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class GenerateQAFromExamplesMapper(Mapper): """ @@ -129,7 +131,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', @@ -223,6 +227,7 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

if len(self.seed_qa_samples) == 0: raise ValueError('No QA data was parsed from the seed file!')
+ def _load_seed_qa_samples(self): """Load QA pairs from chatml format file.""" qa_samples = [] @@ -265,7 +270,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

qa_pairs.append((user_input, assistant_output)) return qa_pairs -
[docs] def build_input(self, qa_examples): +
+[docs] + def build_input(self, qa_examples): def format_qa_pairs(qa_example): return ''.join([ @@ -280,7 +287,10 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

input_prompt = self.input_template.format(examples=formatted_examples) return input_prompt
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): logger.debug(raw_output) output_qa_pairs = [] matches = re.findall(self.output_pattern, raw_output, re.DOTALL) @@ -289,7 +299,10 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

output_qa_pairs.append((question.strip(), answer.strip())) return output_qa_pairs
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): model, _ = get_model(self.model_key, rank, self.use_cuda()) random_qa_samples = random.sample(self.seed_qa_samples, @@ -347,7 +360,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

self.response_key: response, self.history_key: history }) - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html index 4ded6cdec..cce073917 100644 --- a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html +++ b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -93,7 +93,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class GenerateQAFromTextMapper(Mapper): """ @@ -113,7 +115,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

_accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: Optional[str] = None, @@ -180,7 +184,10 @@

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

**model_params) self.sampling_params = sampling_params
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): logger.debug(raw_output) qa_list = [] matches = re.findall(self.output_pattern, raw_output, re.DOTALL) @@ -189,7 +196,10 @@

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

qa_list.append((user.strip(), assistant.strip())) return qa_list
-
[docs] def process_batched(self, samples, rank=None): + +
+[docs] + def process_batched(self, samples, rank=None): model, _ = get_model(self.model_key, rank, self.use_cuda()) input_keys = samples.keys() @@ -222,7 +232,9 @@

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

'No question and answer was extracted from current sample!' ) - return output_samples
+ return output_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/image_blur_mapper.html b/_modules/data_juicer/ops/mapper/image_blur_mapper.html index 5f1ce81ff..f4cbd4105 100644 --- a/_modules/data_juicer/ops/mapper/image_blur_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_blur_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_blur_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_blur_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -91,13 +91,17 @@

Source code for data_juicer.ops.mapper.image_blur_mapper

OP_NAME = 'image_blur_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageBlurMapper(Mapper): """Mapper to blur images. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, @@ -132,7 +136,10 @@

Source code for data_juicer.ops.mapper.image_blur_mapper

else: self.blur = ImageFilter.GaussianBlur(radius)
-
[docs] def process_single(self, sample, context=False): + +
+[docs] + def process_single(self, sample, context=False): # there is no image in this sample if self.image_key not in sample or not sample[self.image_key]: sample[Fields.source_file] = [] @@ -172,7 +179,9 @@

Source code for data_juicer.ops.mapper.image_blur_mapper

sample[Fields.source_file][i] = value sample[self.image_key] = [processed[key] for key in loaded_image_keys] - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html b/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html index d4ab91997..95a7a6d06 100644 --- a/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -173,7 +173,9 @@

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapperreturn None -
[docs]@OPERATORS.register_module('image_captioning_from_gpt4v_mapper') +
+[docs] +@OPERATORS.register_module('image_captioning_from_gpt4v_mapper') @LOADED_IMAGES.register_module('image_captioning_from_gpt4v_mapper') class ImageCaptioningFromGPT4VMapper(Mapper): """Mapper to generate samples whose texts are generated based on @@ -181,7 +183,9 @@

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, mode: str = 'description', api_key: str = '', max_token: int = 500, @@ -259,6 +263,7 @@

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper'Both the parameter `user_prompt` and `user_prompt_key` are ' 'set. Data-Juicer will consider `user_prompt_key` first.')

+ def _process_single_sample(self, sample): # there is no image in this sample if self.image_key not in sample or not sample[self.image_key]: @@ -327,7 +332,9 @@

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapperreturn [generated_sample] -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] for i in range(len(samples[self.text_key])): @@ -348,7 +355,9 @@

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapperfor key in keys: res_samples[key] = [s[key] for s in samples_after_generation] - return res_samples

+ return res_samples

+

+

diff --git a/_modules/data_juicer/ops/mapper/image_captioning_mapper.html b/_modules/data_juicer/ops/mapper/image_captioning_mapper.html index b1e575a93..e8bdc2ba5 100644 --- a/_modules/data_juicer/ops/mapper/image_captioning_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_captioning_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_captioning_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_captioning_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -101,7 +101,9 @@

Source code for data_juicer.ops.mapper.image_captioning_mapper

OP_NAME = 'image_captioning_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageCaptioningMapper(Mapper): """Mapper to generate samples whose captions are generated based on @@ -110,7 +112,9 @@

Source code for data_juicer.ops.mapper.image_captioning_mapper

_accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: PositiveInt = 1, @@ -193,6 +197,7 @@

Source code for data_juicer.ops.mapper.image_captioning_mapper

'Both the parameter `prompt` and `prompt_key` are ' 'set. Data-Juicer will consider `prompt_key` first.')
+ def _process_single_sample(self, ori_sample, rank=None): """ @@ -348,7 +353,9 @@

Source code for data_juicer.ops.mapper.image_captioning_mapper

generated_text_candidates_single_chunk[max_index]) return new_generated_text_per_chunk -
[docs] def process_batched(self, samples, rank=None): +
+[docs] + def process_batched(self, samples, rank=None): """ Note: This is a batched_OP, whose input and output type are @@ -382,7 +389,9 @@

Source code for data_juicer.ops.mapper.image_captioning_mapper

for key in keys: res_samples[key] = [s[key] for s in samples_after_generation] - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html b/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html index 571165e2e..8de67b0c8 100644 --- a/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_diffusion_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_diffusion_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -97,7 +97,9 @@

Source code for data_juicer.ops.mapper.image_diffusion_mapper

OP_NAME = 'image_diffusion_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageDiffusionMapper(Mapper): """ @@ -107,7 +109,9 @@

Source code for data_juicer.ops.mapper.image_diffusion_mapper

_accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', @@ -192,6 +196,7 @@

Source code for data_juicer.ops.mapper.image_diffusion_mapper

revision=revision, trust_remote_code=trust_remote_code)
+ def _real_guidance(self, caption: str, image: Image.Image, rank=None): canvas = image.resize((512, 512), Image.BILINEAR) @@ -284,7 +289,9 @@

Source code for data_juicer.ops.mapper.image_diffusion_mapper

return generated_samples -
[docs] def process_batched(self, samples, rank=None, context=False): +
+[docs] + def process_batched(self, samples, rank=None, context=False): """ Note: This is a batched_OP, whose the input and output type are @@ -318,7 +325,9 @@

Source code for data_juicer.ops.mapper.image_diffusion_mapper

for key in keys: res_samples[key] = [s[key] for s in samples_after_generation] - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html index 944477c5e..33186c25f 100644 --- a/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_face_blur_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_face_blur_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -98,7 +98,9 @@

Source code for data_juicer.ops.mapper.image_face_blur_mapper

OP_NAME = 'image_face_blur_mapper' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageFaceBlurMapper(Mapper): @@ -112,7 +114,9 @@

Source code for data_juicer.ops.mapper.image_face_blur_mapper

'maxSize': None, } -
[docs] def __init__(self, +
+[docs] + def __init__(self, cv_classifier: str = '', blur_type: str = 'gaussian', radius: NonNegativeFloat = 2, @@ -160,7 +164,10 @@

Source code for data_juicer.ops.mapper.image_face_blur_mapper

self.model_key = prepare_model(model_type='opencv_classifier', model_path=cv_classifier)
-
[docs] def process_single(self, sample, context=False): + +
+[docs] + def process_single(self, sample, context=False): # there is no image in this sample if self.image_key not in sample or not sample[self.image_key]: sample[Fields.source_file] = [] @@ -212,7 +219,9 @@

Source code for data_juicer.ops.mapper.image_face_blur_mapper

sample[self.image_key] = [ key_mapping[key] for key in loaded_image_keys ] - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/image_tagging_mapper.html b/_modules/data_juicer/ops/mapper/image_tagging_mapper.html index 3b4133cda..d4079239f 100644 --- a/_modules/data_juicer/ops/mapper/image_tagging_mapper.html +++ b/_modules/data_juicer/ops/mapper/image_tagging_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.image_tagging_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.image_tagging_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -95,7 +95,9 @@

Source code for data_juicer.ops.mapper.image_tagging_mapper

OP_NAME = 'image_tagging_mapper' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) class ImageTaggingMapper(Mapper): @@ -104,7 +106,9 @@

Source code for data_juicer.ops.mapper.image_tagging_mapper

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, tag_field_name: str = Fields.image_tags, *args, **kwargs): @@ -123,7 +127,10 @@

Source code for data_juicer.ops.mapper.image_tagging_mapper

self.transform = ram.get_transform(image_size=384) self.tag_field_name = tag_field_name
-
[docs] def process_single(self, sample, rank=None, context=False): + +
+[docs] + def process_single(self, sample, rank=None, context=False): # check if it's generated already if self.tag_field_name in sample: return sample @@ -154,7 +161,9 @@

Source code for data_juicer.ops.mapper.image_tagging_mapper

image_tags.append(np.array(sorted_word_list, dtype=np.str_)) sample[self.tag_field_name] = image_tags - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html index 23db31107..ea36f1116 100644 --- a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html +++ b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.nlpaug_en_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.nlpaug_en_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -94,13 +94,17 @@

Source code for data_juicer.ops.mapper.nlpaug_en_mapper

OP_NAME = 'nlpaug_en_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class NlpaugEnMapper(Mapper): """Mapper to simply augment samples in English based on nlpaug library.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, sequential: bool = False, aug_num: PositiveInt = 1, keep_original_sample: bool = True, @@ -203,7 +207,10 @@

Source code for data_juicer.ops.mapper.nlpaug_en_mapper

else: self.aug = aug_pipeline
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): # no augmentation methods are opened if len(self.aug) == 0: if self.keep_original_sample: @@ -233,7 +240,9 @@

Source code for data_juicer.ops.mapper.nlpaug_en_mapper

if key != self.text_key: res_samples[key] = res_samples[key] * \ len(res_samples[self.text_key]) - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html index fb3300c77..cbefd91fb 100644 --- a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html +++ b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.nlpcda_zh_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.nlpcda_zh_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,13 +92,17 @@

Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

OP_NAME = 'nlpcda_zh_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class NlpcdaZhMapper(Mapper): """Mapper to simply augment samples in Chinese based on nlpcda library.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, sequential: bool = False, aug_num: PositiveInt = 1, keep_original_sample: bool = True, @@ -208,7 +212,10 @@

Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

self.aug_pipeline.append( nlpcda.EquivalentChar(create_num=create_num))
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): # no augmentation methods are opened if len(self.aug_pipeline) == 0: if self.keep_original_sample: @@ -247,7 +254,9 @@

Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

if key != self.text_key: res_samples[key] = res_samples[key] * \ len(res_samples[self.text_key]) - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html index 4496090b7..95677a4f4 100644 --- a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html +++ b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.optimize_qa_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.optimize_qa_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -93,7 +93,9 @@

Source code for data_juicer.ops.mapper.optimize_qa_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class OptimizeQAMapper(Mapper): """ @@ -113,7 +115,9 @@

Source code for data_juicer.ops.mapper.optimize_qa_mapper

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: Optional[str] = None, @@ -178,13 +182,19 @@

Source code for data_juicer.ops.mapper.optimize_qa_mapper

**model_params) self.sampling_params = sampling_params
-
[docs] def build_input(self, sample): + +
+[docs] + def build_input(self, sample): qa_pair = self.qa_pair_template.format(sample[self.query_key], sample[self.response_key]) input_prompt = self.input_template.format(qa_pair) return input_prompt
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): logger.debug(raw_output) match = re.match(self.output_pattern, raw_output, re.DOTALL) if match: @@ -192,7 +202,10 @@

Source code for data_juicer.ops.mapper.optimize_qa_mapper

else: return None, None
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): model, _ = get_model(self.model_key, rank, self.use_cuda()) input_prompt = self.build_input(sample) @@ -220,7 +233,9 @@

Source code for data_juicer.ops.mapper.optimize_qa_mapper

if parsed_a: sample[self.response_key] = parsed_a - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html index bc07e4660..2d1eecfad 100644 --- a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html +++ b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.optimize_query_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.optimize_query_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.optimize_query_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class OptimizeQueryMapper(OptimizeQAMapper): """ @@ -95,8 +97,12 @@

Source code for data_juicer.ops.mapper.optimize_query_mapper

_accelerator = 'cuda' -
[docs] def parse_output(self, raw_output): - return raw_output.strip(), None
+
+[docs] + def parse_output(self, raw_output): + return raw_output.strip(), None
+
+
diff --git a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html index 2c9452137..f6d88fe51 100644 --- a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html +++ b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.optimize_response_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.optimize_response_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.optimize_response_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) class OptimizeResponseMapper(OptimizeQAMapper): """ @@ -95,8 +97,12 @@

Source code for data_juicer.ops.mapper.optimize_response_mapper

_accelerator = 'cuda' -
[docs] def parse_output(self, raw_output): - return None, raw_output.strip()
+
+[docs] + def parse_output(self, raw_output): + return None, raw_output.strip()
+
+
diff --git a/_modules/data_juicer/ops/mapper/pair_preference_mapper.html b/_modules/data_juicer/ops/mapper/pair_preference_mapper.html index cb6ab65a5..15708965a 100644 --- a/_modules/data_juicer/ops/mapper/pair_preference_mapper.html +++ b/_modules/data_juicer/ops/mapper/pair_preference_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.pair_preference_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.pair_preference_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -90,7 +90,9 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

# TODO: Extend LLM-based OPs into API-based implementation. -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class PairPreferenceMapper(Mapper): """ Mapper to construct paired preference samples. @@ -114,7 +116,9 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

'{response}') DEFAULT_OUTPUT_PATTERN = r'.*?【回答】\s*(.*?)\s*【原因】\s*(.*)' -
[docs] def __init__(self, +
+[docs] + def __init__(self, api_model: str = 'gpt-4o', *, api_endpoint: Optional[str] = None, @@ -168,7 +172,10 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

self.try_num = try_num self.sampling_params = sampling_params
-
[docs] def build_input(self, sample): + +
+[docs] + def build_input(self, sample): mapping = { 'query': sample[self.query_key], 'response': sample[self.response_key], @@ -176,7 +183,10 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

} return self.input_template.format_map(mapping)
-
[docs] def parse_output(self, raw_output): + +
+[docs] + def parse_output(self, raw_output): logger.debug(raw_output) match = re.match(self.output_pattern, raw_output, re.DOTALL) if match: @@ -184,7 +194,10 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

else: return ('', '')
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): client = get_model(self.model_key, rank=rank) messages = [{ @@ -207,7 +220,9 @@

Source code for data_juicer.ops.mapper.pair_preference_mapper

sample[self.rejected_key] = parsed_rejected sample[self.reason_key] = parsed_reason - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html index f136424e4..2d4448d3c 100644 --- a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html +++ b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.punctuation_normalization_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.punctuation_normalization_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,14 +84,18 @@

Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('punctuation_normalization_mapper') +
+[docs] +@OPERATORS.register_module('punctuation_normalization_mapper') class PunctuationNormalizationMapper(Mapper): """Mapper to normalize unicode punctuations to English punctuations in text samples.""" _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -136,12 +140,17 @@

Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

'►': '-', }
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): samples[self.text_key] = [ ''.join([self.punctuation_unicode.get(c, c) for c in text]) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/python_file_mapper.html b/_modules/data_juicer/ops/mapper/python_file_mapper.html new file mode 100644 index 000000000..ba005ce2a --- /dev/null +++ b/_modules/data_juicer/ops/mapper/python_file_mapper.html @@ -0,0 +1,218 @@ + + + + + + + + data_juicer.ops.mapper.python_file_mapper — data_juicer 1.0.1 documentation + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.ops.mapper.python_file_mapper

+import importlib.util
+import inspect
+import os
+
+from ..base_op import OPERATORS, Mapper
+
+OP_NAME = 'python_file_mapper'
+
+
+
+[docs] +@OPERATORS.register_module(OP_NAME) +class PythonFileMapper(Mapper): + """Mapper for executing Python function defined in a file.""" + +
+[docs] + def __init__(self, + file_path: str = '', + function_name: str = 'process_single', + batched: bool = False, + **kwargs): + """ + Initialization method. + + :param file_path: The path to the Python file containing the function + to be executed. + :param function_name: The name of the function defined in the file + to be executed. + :param batched: A boolean indicating whether to process input data in + batches. + :param kwargs: Additional keyword arguments passed to the parent class. + """ + self._batched_op = bool(batched) + super().__init__(**kwargs) + + self.file_path = file_path + self.function_name = function_name + if not file_path: + self.func = lambda sample: sample + else: + self.func = self._load_function()
+ + + def _load_function(self): + if not os.path.isfile(self.file_path): + raise FileNotFoundError( + f"The file '{self.file_path}' does not exist.") + + if not self.file_path.endswith('.py'): + raise ValueError( + f"The file '{self.file_path}' is not a Python file.") + + # Load the module from the file + module_name = os.path.splitext(os.path.basename(self.file_path))[0] + spec = importlib.util.spec_from_file_location(module_name, + self.file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Fetch the specified function from the module + if not hasattr(module, self.function_name): + raise ValueError( + f"Function '{self.function_name}' not found in '{self.file_path}'." # noqa: E501 + ) + + func = getattr(module, self.function_name) + + if not callable(func): + raise ValueError( + f"The attribute '{self.function_name}' is not callable.") + + # Check that the function has exactly one argument + argspec = inspect.getfullargspec(func) + if len(argspec.args) != 1: + raise ValueError( + f"The function '{self.function_name}' must take exactly one argument" # noqa: E501 + ) + + return func + +
+[docs] + def process_single(self, sample): + """Invoke the loaded function with the provided sample.""" + result = self.func(sample) + + if not isinstance(result, dict): + raise ValueError( + f'Function must return a dictionary, got {type(result).__name__} instead.' # noqa: E501 + ) + + return result
+ + +
+[docs] + def process_batched(self, samples): + """Invoke the loaded function with the provided samples.""" + result = self.func(samples) + + if not isinstance(result, dict): + raise ValueError( + f'Function must return a dictionary, got {type(result).__name__} instead.' # noqa: E501 + ) + + return result
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/python_lambda_mapper.html b/_modules/data_juicer/ops/mapper/python_lambda_mapper.html new file mode 100644 index 000000000..51f7efd8f --- /dev/null +++ b/_modules/data_juicer/ops/mapper/python_lambda_mapper.html @@ -0,0 +1,195 @@ + + + + + + + + data_juicer.ops.mapper.python_lambda_mapper — data_juicer 1.0.1 documentation + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.ops.mapper.python_lambda_mapper

+import ast
+
+from ..base_op import OPERATORS, Mapper
+
+OP_NAME = 'python_lambda_mapper'
+
+
+
+[docs] +@OPERATORS.register_module(OP_NAME) +class PythonLambdaMapper(Mapper): + """Mapper for executing Python lambda function on data samples.""" + +
+[docs] + def __init__(self, lambda_str: str = '', batched: bool = False, **kwargs): + """ + Initialization method. + + :param lambda_str: A string representation of the lambda function to be + executed on data samples. If empty, the identity function is used. + :param batched: A boolean indicating whether to process input data in + batches. + :param kwargs: Additional keyword arguments passed to the parent class. + """ + self._batched_op = bool(batched) + super().__init__(**kwargs) + + # Parse and validate the lambda function + if not lambda_str: + self.lambda_func = lambda sample: sample + else: + self.lambda_func = self._create_lambda(lambda_str)
+ + + def _create_lambda(self, lambda_str: str): + # Parse input string into an AST and check for a valid lambda function + try: + node = ast.parse(lambda_str, mode='eval') + + # Check if the body of the expression is a lambda + if not isinstance(node.body, ast.Lambda): + raise ValueError( + 'Input string must be a valid lambda function.') + + # Check that the lambda has exactly one argument + if len(node.body.args.args) != 1: + raise ValueError( + 'Lambda function must have exactly one argument.') + + # Compile the AST to code + compiled_code = compile(node, '<string>', 'eval') + # Safely evaluate the compiled code allowing built-in functions + func = eval(compiled_code, {'__builtins__': __builtins__}) + return func + except Exception as e: + raise ValueError(f'Invalid lambda function: {e}') + +
+[docs] + def process_single(self, sample): + # Process the input through the lambda function and return the result + result = self.lambda_func(sample) + + # Check if the result is a valid + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result
+ + +
+[docs] + def process_batched(self, samples): + # Process the input through the lambda function and return the result + result = self.lambda_func(samples) + + # Check if the result is a valid + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result
+
+ +
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html index 05924f279..494e90288 100644 --- a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_bibliography_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_bibliography_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -86,14 +86,18 @@

Source code for data_juicer.ops.mapper.remove_bibliography_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_bibliography_mapper') +
+[docs] +@OPERATORS.register_module('remove_bibliography_mapper') class RemoveBibliographyMapper(Mapper): """Mapper to remove bibliography at the end of documents in Latex samples.""" _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -108,7 +112,10 @@

Source code for data_juicer.ops.mapper.remove_bibliography_mapper

self.pattern += r'\\bibliography\{.*\}' self.pattern += r').*$'
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): samples[self.text_key] = [ re.sub(pattern=self.pattern, repl=r'', @@ -116,7 +123,9 @@

Source code for data_juicer.ops.mapper.remove_bibliography_mapper

flags=re.DOTALL) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html index 23f265b09..fda49dfc7 100644 --- a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_comments_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_comments_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -88,7 +88,9 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_comments_mapper') +
+[docs] +@OPERATORS.register_module('remove_comments_mapper') class RemoveCommentsMapper(Mapper): """ Mapper to remove comments in different kinds of documents. @@ -98,7 +100,9 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, doc_type: Union[str, List[str]] = 'tex', inline: bool = True, multiline: bool = True, @@ -118,7 +122,10 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

self.inline = inline self.multiline = multiline
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): # TODO: remove different comments by sample type for idx, text in enumerate(samples[self.text_key]): @@ -137,7 +144,9 @@

Source code for data_juicer.ops.mapper.remove_comments_mapper

samples[self.text_key][idx] = text - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_header_mapper.html b/_modules/data_juicer/ops/mapper/remove_header_mapper.html index 26c11c4e6..e49f584d3 100644 --- a/_modules/data_juicer/ops/mapper/remove_header_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_header_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_header_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_header_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -86,14 +86,18 @@

Source code for data_juicer.ops.mapper.remove_header_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_header_mapper') +
+[docs] +@OPERATORS.register_module('remove_header_mapper') class RemoveHeaderMapper(Mapper): """Mapper to remove headers at the beginning of documents in Latex samples.""" _batched_op = True -
[docs] def __init__(self, drop_no_head: bool = True, *args, **kwargs): +
+[docs] + def __init__(self, drop_no_head: bool = True, *args, **kwargs): """ Initialization method. @@ -115,7 +119,10 @@

Source code for data_juicer.ops.mapper.remove_header_mapper

self.drop_no_head = drop_no_head
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if not re.search(self.pattern, text, flags=re.DOTALL): if self.drop_no_head: @@ -128,7 +135,9 @@

Source code for data_juicer.ops.mapper.remove_header_mapper

samples[self.text_key][idx] = text - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html index 393870cee..46d453d0f 100644 --- a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_long_words_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_long_words_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -88,13 +88,17 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

split_on_newline_tab_whitespace, strip) -
[docs]@OPERATORS.register_module('remove_long_words_mapper') +
+[docs] +@OPERATORS.register_module('remove_long_words_mapper') class RemoveLongWordsMapper(Mapper): """Mapper to remove long words within a specific range.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_len: int = 1, max_len: int = sys.maxsize, *args, @@ -113,7 +117,10 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

self.min_len = min_len self.max_len = max_len
-
[docs] def should_keep_long_word(self, word): + +
+[docs] + def should_keep_long_word(self, word): if self.min_len <= len(word) <= self.max_len: return True elif self.min_len <= len(strip(word, @@ -122,7 +129,10 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

else: return False
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): sentences = split_on_newline_tab_whitespace(text) sentences = [[[ @@ -131,7 +141,9 @@

Source code for data_juicer.ops.mapper.remove_long_words_mapper

] for subsentence in sentence] for sentence in sentences] samples[self.text_key][idx] = merge_on_whitespace_tab_newline( sentences) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html b/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html index 4fddeaf77..e18d385ba 100644 --- a/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_non_chinese_character_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_non_chinese_character_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -82,13 +82,17 @@

Source code for data_juicer.ops.mapper.remove_non_chinese_character_mapperfrom ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_non_chinese_character_mapper') +
+[docs] +@OPERATORS.register_module('remove_non_chinese_character_mapper') class RemoveNonChineseCharacterlMapper(Mapper): """Mapper to remove non chinese Character in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, @@ -114,7 +118,10 @@

Source code for data_juicer.ops.mapper.remove_non_chinese_character_mapperelse: self.pattern += u']'

-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if not re.search(self.pattern, text, flags=re.DOTALL): continue @@ -123,7 +130,9 @@

Source code for data_juicer.ops.mapper.remove_non_chinese_character_mapperrepl=r'', string=text, flags=re.DOTALL) - return samples

+ return samples
+
+

diff --git a/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html b/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html index 961eb231d..f62f999b8 100644 --- a/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_repeat_sentences_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_repeat_sentences_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -90,13 +90,17 @@

Source code for data_juicer.ops.mapper.remove_repeat_sentences_mapper

return text.split('\n') -
[docs]@OPERATORS.register_module('remove_repeat_sentences_mapper') +
+[docs] +@OPERATORS.register_module('remove_repeat_sentences_mapper') class RemoveRepeatSentencesMapper(Mapper): """Mapper to remove repeat sentences in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, @@ -124,7 +128,10 @@

Source code for data_juicer.ops.mapper.remove_repeat_sentences_mapper

self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]' ) if ignore_special_character else None
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): lines = [e for e in text.split('\n')] new_lines = [] @@ -149,7 +156,9 @@

Source code for data_juicer.ops.mapper.remove_repeat_sentences_mapper

samples[self.text_key][idx] = '\n'.join(new_lines) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html index 5430bcee8..647c10537 100644 --- a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_specific_chars_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_specific_chars_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,13 +84,17 @@

Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_specific_chars_mapper') +
+[docs] +@OPERATORS.register_module('remove_specific_chars_mapper') class RemoveSpecificCharsMapper(Mapper): """Mapper to clean specific chars in text samples.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs): @@ -109,7 +113,10 @@

Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

else: self.pattern = None
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): if self.pattern is None: return samples @@ -119,7 +126,9 @@

Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

string=text, flags=re.DOTALL) for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html index bedd9d40e..342d5a05d 100644 --- a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_table_text_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_table_text_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('remove_table_text_mapper') +
+[docs] +@OPERATORS.register_module('remove_table_text_mapper') class RemoveTableTextMapper(Mapper): """ Mapper to remove table texts from text samples. @@ -95,7 +97,9 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_col: Annotated[int, Field(ge=2, le=20)] = 2, max_col: Annotated[int, Field(ge=2, le=20)] = 20, *args, @@ -113,7 +117,10 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

self.max_col = max_col self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): for i in range(self.min_col - 1, self.max_col): pattern = re.compile(self.pattern % i) @@ -121,7 +128,9 @@

Source code for data_juicer.ops.mapper.remove_table_text_mapper

samples[self.text_key][idx] = text - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html index f9c87eeb4..800168143 100644 --- a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html +++ b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -89,13 +89,17 @@

Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring OP_NAME = 'remove_words_with_incorrect_substrings_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class RemoveWordsWithIncorrectSubstringsMapper(Mapper): """Mapper to remove words with incorrect substrings.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, lang: str = 'en', tokenization: bool = False, substrings: Optional[List[str]] = None, @@ -120,12 +124,18 @@

Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring self.model_key = prepare_model(model_type='sentencepiece', lang=lang)

-
[docs] def should_keep_word_with_incorrect_substrings(self, word, substrings): + +
+[docs] + def should_keep_word_with_incorrect_substrings(self, word, substrings): word = strip(word, SPECIAL_CHARACTERS) should_keep = all([(i_substr not in word) for i_substr in substrings]) return should_keep
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): if self.tokenization: tokenizer = get_model(self.model_key) @@ -151,7 +161,9 @@

Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substring samples[self.text_key][idx] = text - return samples

+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/replace_content_mapper.html b/_modules/data_juicer/ops/mapper/replace_content_mapper.html index e49efaf91..80952782b 100644 --- a/_modules/data_juicer/ops/mapper/replace_content_mapper.html +++ b/_modules/data_juicer/ops/mapper/replace_content_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.replace_content_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.replace_content_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -84,7 +84,9 @@

Source code for data_juicer.ops.mapper.replace_content_mapper

from ..base_op import OPERATORS, Mapper -
[docs]@OPERATORS.register_module('replace_content_mapper') +
+[docs] +@OPERATORS.register_module('replace_content_mapper') class ReplaceContentMapper(Mapper): """Mapper to replace all content in the text that matches a specific regular expression pattern with a designated @@ -92,7 +94,9 @@

Source code for data_juicer.ops.mapper.replace_content_mapper

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, pattern: Union[str, List[str], None] = None, repl: Union[str, List[str]] = '', *args, @@ -115,6 +119,7 @@

Source code for data_juicer.ops.mapper.replace_content_mapper

for p in pattern: self.compiled_patterns.append(self._prepare_pattern(p))
+ def _prepare_pattern(self, pattern: str) -> re.Pattern: """Prepare the regular expression pattern.""" if ((pattern is not None and len(pattern) > 2) @@ -123,7 +128,9 @@

Source code for data_juicer.ops.mapper.replace_content_mapper

pattern = pattern[2:-1] return re.compile(pattern, flags=re.DOTALL) -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): if self.pattern is None: return samples @@ -142,7 +149,9 @@

Source code for data_juicer.ops.mapper.replace_content_mapper

samples[self.text_key][idx] = text - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html index 07013d010..0199e6308 100644 --- a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html +++ b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.sentence_split_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.sentence_split_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -85,13 +85,17 @@

Source code for data_juicer.ops.mapper.sentence_split_mapper

OP_NAME = 'sentence_split_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class SentenceSplitMapper(Mapper): """Mapper to split text samples to sentences.""" _batched_op = True -
[docs] def __init__(self, lang: str = 'en', *args, **kwargs): +
+[docs] + def __init__(self, lang: str = 'en', *args, **kwargs): """ Initialization method. @@ -103,7 +107,10 @@

Source code for data_juicer.ops.mapper.sentence_split_mapper

self.lang = lang self.model_key = prepare_model(model_type='nltk', lang=lang)
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): nltk_model = get_model(self.model_key) @@ -113,7 +120,9 @@

Source code for data_juicer.ops.mapper.sentence_split_mapper

for text in samples[self.text_key] ] - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/text_chunk_mapper.html b/_modules/data_juicer/ops/mapper/text_chunk_mapper.html index e8a5ca713..9f6e879f5 100644 --- a/_modules/data_juicer/ops/mapper/text_chunk_mapper.html +++ b/_modules/data_juicer/ops/mapper/text_chunk_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.text_chunk_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.text_chunk_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -90,13 +90,17 @@

Source code for data_juicer.ops.mapper.text_chunk_mapper

OP_NAME = 'text_chunk_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class TextChunkMapper(Mapper): """Split input text to chunks.""" _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, max_len: Union[PositiveInt, None] = None, split_pattern: Union[str, None] = r'\n\n', overlap_len: NonNegativeInt = 0, @@ -141,7 +145,10 @@

Source code for data_juicer.ops.mapper.text_chunk_mapper

return_processor=True, processor_config={'trust_remote_code': trust_remote_code})
-
[docs] def recursively_chunk(self, text): + +
+[docs] + def recursively_chunk(self, text): if self.tokenizer_name is not None: _, tokenizer = get_model(self.model_key) tokens = tokenizer.encode(text) @@ -169,7 +176,10 @@

Source code for data_juicer.ops.mapper.text_chunk_mapper

return [cur_text] + self.recursively_chunk(left_text)
-
[docs] def get_text_chunks(self, text, rank=None): + +
+[docs] + def get_text_chunks(self, text, rank=None): if self.split_pattern is not None and self.max_len is None: chunks = re.split(f'({self.split_pattern})', text) @@ -194,7 +204,10 @@

Source code for data_juicer.ops.mapper.text_chunk_mapper

return chunks
-
[docs] def process_batched(self, samples, rank=None): + +
+[docs] + def process_batched(self, samples, rank=None): sample_num = len(samples[self.text_key]) @@ -212,7 +225,9 @@

Source code for data_juicer.ops.mapper.text_chunk_mapper

for key in samples: samples[key] = list(chain(*samples[key])) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html index 93f2d8b81..775a52078 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_captioning_from_audio_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_captioning_from_audio_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -91,7 +91,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_audio_mapperNAME = 'video_captioning_from_audio_mapper' -
[docs]@OPERATORS.register_module(NAME) +
+[docs] +@OPERATORS.register_module(NAME) class VideoCaptioningFromAudioMapper(Mapper): """Mapper to caption a video according to its audio streams based on Qwen-Audio model. @@ -100,7 +102,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_audio_mapper_accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, keep_original_sample: bool = True, *args, **kwargs): +
+[docs] + def __init__(self, keep_original_sample: bool = True, *args, **kwargs): """ Initialization method. @@ -130,6 +134,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_audio_mapper'<|unkown|><|notimestamps|><|wo_itn|>' self.response_remove_pattern = re.compile(r'<\|.*?\|>')

+ def _process_single_sample(self, sample, rank=None): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: @@ -195,7 +200,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_audio_mappercaptioned_sample[self.video_key] = left_video_keys return [captioned_sample] -
[docs] def process_batched(self, samples, rank=None): +
+[docs] + def process_batched(self, samples, rank=None): # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] for i in range(len(samples[self.text_key])): @@ -217,7 +224,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_audio_mapperfor key in keys: res_samples[key] = [s[key] for s in samples_after_split] - return res_samples

+ return res_samples

+

+

diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html index 69dad9ae5..478d0d3ae 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_captioning_from_frames_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_captioning_from_frames_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -106,7 +106,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapperOP_NAME = 'video_captioning_from_frames_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoCaptioningFromFramesMapper(Mapper): """Mapper to generate samples whose captions are generated based on @@ -116,7 +118,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper_accelerator = 'cuda' _batched_op = True -
[docs] def __init__( +
+[docs] + def __init__( self, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, @@ -234,6 +238,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mappertrust_remote_code=trust_remote_code )

+ def _process_single_sample(self, ori_sample, rank=None, context=False): # there is no videos in this sample @@ -403,7 +408,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mappergenerated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk -
[docs] def process_batched(self, samples, rank=None, context=False): +
+[docs] + def process_batched(self, samples, rank=None, context=False): """ :param samples: :return: @@ -438,7 +445,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapperfor key in keys: res_samples[key] = [s[key] for s in samples_after_generation] - return res_samples

+ return res_samples

+

+

diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html index f8efe5183..ae2793fc5 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_captioning_from_summarizer_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_captioning_from_summarizer_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapp NAME = 'video_captioning_from_summarizer_mapper' -
[docs]@OPERATORS.register_module(NAME) +
+[docs] +@OPERATORS.register_module(NAME) class VideoCaptioningFromSummarizerMapper(Mapper): """ Mapper to generate video captions by summarizing several kinds of generated @@ -102,7 +104,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapp _accelerator = 'cuda' _batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_summarizer: str = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, @@ -230,6 +234,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapp self.keep_tag_num = keep_tag_num

+ def _prepare_op_args(self, op_class, args_dict): required_args = set(op_class.__init__.__code__.co_varnames) args_dict.update(self.FIXED_ARGS) @@ -312,7 +317,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapp captioned_sample[self.text_key] = captioned_texts return [captioned_sample] -
[docs] def process_batched(self, samples, rank=None): +
+[docs] + def process_batched(self, samples, rank=None): # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] for i in range(len(samples[self.text_key])): @@ -334,7 +341,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapp for key in keys: res_samples[key] = [s[key] for s in samples_after_split] - return res_samples

+ return res_samples

+

+

diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html index 0ce43d586..53721628e 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_captioning_from_video_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_captioning_from_video_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -106,7 +106,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapperOP_NAME = 'video_captioning_from_video_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoCaptioningFromVideoMapper(Mapper): """Mapper to generate samples whose captions are generated based on @@ -115,7 +117,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper_accelerator = 'cuda' _batched_op = True -
[docs] def __init__( +
+[docs] + def __init__( self, hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, @@ -234,6 +238,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mappertrust_remote_code=trust_remote_code )

+ def _process_single_sample(self, ori_sample, rank=None, context=False): # there is no videos in this sample @@ -410,7 +415,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mappergenerated_text_candidates_single_chunk[max_index]) return generated_text_per_chunk -
[docs] def process_batched(self, samples, rank=None, context=False): +
+[docs] + def process_batched(self, samples, rank=None, context=False): """ :param samples: :return: @@ -445,7 +452,9 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapperfor key in keys: res_samples[key] = [s[key] for s in samples_after_generation] - return res_samples

+ return res_samples

+

+

diff --git a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html index 4c630cb4a..0a29f4980 100644 --- a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_face_blur_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_face_blur_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -98,7 +98,9 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

OP_NAME = 'video_face_blur_mapper' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoFaceBlurMapper(Mapper): @@ -112,7 +114,9 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

'maxSize': None, } -
[docs] def __init__(self, +
+[docs] + def __init__(self, cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, @@ -160,7 +164,10 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

self.model_key = prepare_model(model_type='opencv_classifier', model_path=cv_classifier)
-
[docs] def process_single(self, sample, context=False): + +
+[docs] + def process_single(self, sample, context=False): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -211,7 +218,9 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

sample[self.video_key] = [ processed_video_keys[key] for key in loaded_video_keys ] - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html b/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html index e8dffca81..1afe530dd 100644 --- a/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,12 +92,16 @@

Source code for data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper

OP_NAME = 'video_ffmpeg_wrapped_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class VideoFFmpegWrappedMapper(Mapper): """Simple wrapper for FFmpeg video filters. """ -
[docs] def __init__( +
+[docs] + def __init__( self, filter_name: Optional[str] = None, filter_kwargs: Optional[Dict] = None, @@ -127,7 +131,10 @@

Source code for data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper

self.capture_stderr = capture_stderr self.overwrite_output = overwrite_output
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -162,7 +169,9 @@

Source code for data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper

sample[Fields.source_file][i] = value sample[self.video_key] = [processed[key] for key in loaded_video_keys] - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html index c41d465a0..2c216a15c 100644 --- a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_remove_watermark_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_remove_watermark_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -103,14 +103,18 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

OP_NAME = 'video_remove_watermark_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoRemoveWatermarkMapper(Mapper): """ Remove the watermarks in videos given regions. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: Optional[str] = None, @@ -183,6 +187,7 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

self.min_frame_threshold = min_frame_threshold self.detection_method = detection_method
+ def _detect_watermark_via_pixel_value(self, frames, rois): masks = [] @@ -280,7 +285,9 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

new_np_frame = cv2.inpaint(np_frame, watermark_mask, 3, cv2.INPAINT_NS) return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24') -
[docs] def process_single(self, sample, context=False): +
+[docs] + def process_single(self, sample, context=False): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -322,7 +329,9 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

sample[Fields.source_file][i] = value sample[self.video_key] = loaded_video_keys - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html index c1d43d34f..bca5bc01c 100644 --- a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_resize_aspect_ratio_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_resize_aspect_ratio_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -132,7 +132,9 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

return scaled_width, scaled_height -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class VideoResizeAspectRatioMapper(Mapper): """Mapper to resize videos by aspect ratio. AspectRatio = W / H. @@ -140,7 +142,9 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

STRATEGY = ['decrease', 'increase'] -
[docs] def __init__( +
+[docs] + def __init__( self, min_ratio: str = '9/21', max_ratio: str = '21/9', @@ -179,7 +183,10 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

self.max_ratio = Fraction(str(max_ratio).replace(':', '/')) self.strategy = strategy
-
[docs] def process_single(self, sample): + +
+[docs] + def process_single(self, sample): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -230,7 +237,9 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

sample[Fields.source_file][i] = value sample[self.video_key] = loaded_video_keys - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html index 207add7cf..d6d85e22b 100644 --- a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_resize_resolution_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_resize_resolution_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -98,7 +98,9 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

OP_NAME = 'video_resize_resolution_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoResizeResolutionMapper(Mapper): """ @@ -106,7 +108,9 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

with deep learning for future works. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, min_width: int = 1, max_width: int = sys.maxsize, min_height: int = 1, @@ -163,7 +167,10 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

self.force_original_aspect_ratio = force_original_aspect_ratio self.force_divisible_by = force_divisible_by
-
[docs] def process_single(self, sample, context=False): + +
+[docs] + def process_single(self, sample, context=False): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -254,7 +261,9 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

sample[Fields.source_file][i] = value sample[self.video_key] = loaded_video_keys - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html index a1b75593d..ad7a11f42 100644 --- a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_split_by_duration_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_split_by_duration_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -104,7 +104,9 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

OP_NAME = 'video_split_by_duration_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoSplitByDurationMapper(Mapper): """Mapper to split video by duration. @@ -112,7 +114,9 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

_batched_op = True -
[docs] def __init__(self, +
+[docs] + def __init__(self, split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, @@ -140,7 +144,10 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

self.keep_original_sample = keep_original_sample self.extra_args = kwargs
-
[docs] def split_videos_by_duration(self, video_key, container): + +
+[docs] + def split_videos_by_duration(self, video_key, container): video_duration = get_video_duration(container) timestamps = np.arange(0, video_duration, self.split_duration).tolist() count = 0 @@ -164,6 +171,7 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

split_video_keys.append(split_video_key) return split_video_keys
+ def _process_single_sample(self, sample): # there is no video in this sample if self.video_key not in sample or sample[ @@ -222,7 +230,9 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

split_sample[self.video_key] = split_video_keys return [split_sample] -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] for i in range(len(samples[self.text_key])): @@ -242,7 +252,9 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

res_samples = {} for key in keys: res_samples[key] = [s[key] for s in samples_after_split] - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html index bdbcc97d7..cca86af1c 100644 --- a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_split_by_key_frame_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_split_by_key_frame_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -102,7 +102,9 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< OP_NAME = 'video_split_by_key_frame_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoSplitByKeyFrameMapper(Mapper): """Mapper to split video by key frame. @@ -110,7 +112,9 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< _batched_op = True -
[docs] def __init__(self, keep_original_sample: bool = True, *args, **kwargs): +
+[docs] + def __init__(self, keep_original_sample: bool = True, *args, **kwargs): """ Initialization method. @@ -127,7 +131,10 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< self.keep_original_sample = keep_original_sample self.extra_args = kwargs
-
[docs] def get_split_key_frame(self, video_key, container): + +
+[docs] + def get_split_key_frame(self, video_key, container): timestamps = get_key_frame_seconds(container) count = 0 @@ -147,6 +154,7 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< split_video_keys.append(split_video_key) return split_video_keys
+ def _process_single_sample(self, sample): # there is no video in this sample if self.video_key not in sample or sample[ @@ -204,7 +212,9 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< split_sample[self.video_key] = split_video_keys return [split_sample] -
[docs] def process_batched(self, samples): +
+[docs] + def process_batched(self, samples): # reconstruct samples from "dict of lists" to "list of dicts" reconstructed_samples = [] for i in range(len(samples[self.text_key])): @@ -225,7 +235,9 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< for key in keys: res_samples[key] = [s[key] for s in samples_after_split] - return res_samples
+ return res_samples
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html index 3056e0da9..1abe539aa 100644 --- a/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_split_by_scene_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_split_by_scene_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -104,7 +104,9 @@

Source code for data_juicer.ops.mapper.video_split_by_scene_mapper

return match.group(0) -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class VideoSplitBySceneMapper(Mapper): """Mapper to cut videos into scene clips. """ @@ -120,7 +122,9 @@

Source code for data_juicer.ops.mapper.video_split_by_scene_mapper

['fade_bias', 'add_final_scene', 'method', 'block_size'] } -
[docs] def __init__(self, +
+[docs] + def __init__(self, detector: str = 'ContentDetector', threshold: NonNegativeFloat = 27.0, min_scene_len: NonNegativeInt = 15, @@ -159,7 +163,10 @@

Source code for data_juicer.ops.mapper.video_split_by_scene_mapper

for key in avaliable_kwargs if key in kwargs }
-
[docs] def process_single(self, sample, context=False): + +
+[docs] + def process_single(self, sample, context=False): # there is no video in this sample if self.video_key not in sample or not sample[self.video_key]: sample[Fields.source_file] = [] @@ -226,7 +233,9 @@

Source code for data_juicer.ops.mapper.video_split_by_scene_mapper

sample[self.video_key] = list( chain.from_iterable( [output_video_keys[key] for key in loaded_video_keys])) - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html index f8d3311c8..0b01d52b4 100644 --- a/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_tagging_from_audio_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_tagging_from_audio_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -92,7 +92,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_audio_mapper

< OP_NAME = 'video_tagging_from_audio_mapper' -
[docs]@OPERATORS.register_module(OP_NAME) +
+[docs] +@OPERATORS.register_module(OP_NAME) class VideoTaggingFromAudioMapper(Mapper): """Mapper to generate video tags from audio streams extracted by video using the Audio Spectrogram Transformer. @@ -100,7 +102,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_audio_mapper

< _accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = Fields.video_audio_tags, @@ -126,7 +130,10 @@

Source code for data_juicer.ops.mapper.video_tagging_from_audio_mapper

< self.tag_field_name = tag_field_name
-
[docs] def process_single(self, sample, rank=None): + +
+[docs] + def process_single(self, sample, rank=None): # check if it's generated already if self.tag_field_name in sample: return sample @@ -169,7 +176,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_audio_mapper

< predicted_tag = model.config.id2label[predicted_tag_id] video_audio_tags.append(predicted_tag) sample[self.tag_field_name] = np.array(video_audio_tags, dtype=np.str_) - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html index ec282b001..c03e41de6 100644 --- a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.video_tagging_from_frames_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.video_tagging_from_frames_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -98,7 +98,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

OP_NAME = 'video_tagging_from_frames_mapper' -
[docs]@UNFORKABLE.register_module(OP_NAME) +
+[docs] +@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) class VideoTaggingFromFramesMapper(Mapper): @@ -107,7 +109,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

_accelerator = 'cuda' -
[docs] def __init__(self, +
+[docs] + def __init__(self, frame_sampling_method: str = 'all_keyframes', frame_num: PositiveInt = 3, tag_field_name: str = Fields.video_frame_tags, @@ -149,7 +153,10 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

self.tag_field_name = tag_field_name
-
[docs] def process_single(self, sample, rank=None, context=False): + +
+[docs] + def process_single(self, sample, rank=None, context=False): # check if it's generated already if self.tag_field_name in sample: return sample @@ -194,7 +201,9 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

close_video(videos[vid_key]) sample[self.tag_field_name] = video_tags - return sample
+ return sample
+
+
diff --git a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html index 637aa3c30..fd11c3bf5 100644 --- a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html +++ b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.mapper.whitespace_normalization_mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper.whitespace_normalization_mapper — data_juicer 1.0.1 documentation - - - + + + @@ -85,7 +85,9 @@

Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

< from ..common.special_characters import VARIOUS_WHITESPACES -
[docs]@OPERATORS.register_module('whitespace_normalization_mapper') +
+[docs] +@OPERATORS.register_module('whitespace_normalization_mapper') class WhitespaceNormalizationMapper(Mapper): """ Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20) @@ -97,7 +99,9 @@

Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

< _batched_op = True -
[docs] def __init__(self, *args, **kwargs): +
+[docs] + def __init__(self, *args, **kwargs): """ Initialization method. @@ -106,7 +110,10 @@

Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

< """ super().__init__(*args, **kwargs)
-
[docs] def process_batched(self, samples): + +
+[docs] + def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): # remove whitespaces before and after the main content text = text.strip() @@ -117,7 +124,9 @@

Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

< for char in text ]) - return samples
+ return samples
+
+
diff --git a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html index 16bcf4370..a1600e4e9 100644 --- a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html +++ b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.selector.frequency_specified_field_selector — data_juicer 1.0.0 documentation + data_juicer.ops.selector.frequency_specified_field_selector — data_juicer 1.0.1 documentation - - - + + + @@ -86,12 +86,16 @@

Source code for data_juicer.ops.selector.frequency_specified_field_selector< from ..base_op import OPERATORS, Selector -
[docs]@OPERATORS.register_module('frequency_specified_field_selector') +
+[docs] +@OPERATORS.register_module('frequency_specified_field_selector') class FrequencySpecifiedFieldSelector(Selector): """Selector to select samples based on the sorted frequency of specified field.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, field_key: str = '', top_ratio: Optional[Annotated[float, Field(ge=0, le=1)]] = None, @@ -127,7 +131,10 @@

Source code for data_juicer.ops.selector.frequency_specified_field_selector< self.topk = topk self.reverse = reverse

-
[docs] def process(self, dataset): + +
+[docs] + def process(self, dataset): if len(dataset) <= 1 or not self.field_key: return dataset @@ -166,7 +173,9 @@

Source code for data_juicer.ops.selector.frequency_specified_field_selector< sorted(field_value_dict.values(), key=lambda x: len(x), reverse=self.reverse)[:int(select_num)], []) - return dataset.select(select_index)

+ return dataset.select(select_index)
+
+

diff --git a/_modules/data_juicer/ops/selector/random_selector.html b/_modules/data_juicer/ops/selector/random_selector.html index 07518ba68..33edbd4f7 100644 --- a/_modules/data_juicer/ops/selector/random_selector.html +++ b/_modules/data_juicer/ops/selector/random_selector.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.selector.random_selector — data_juicer 1.0.0 documentation + data_juicer.ops.selector.random_selector — data_juicer 1.0.1 documentation - - - + + + @@ -87,11 +87,15 @@

Source code for data_juicer.ops.selector.random_selector

from ..base_op import OPERATORS, Selector -
[docs]@OPERATORS.register_module('random_selector') +
+[docs] +@OPERATORS.register_module('random_selector') class RandomSelector(Selector): """Selector to random select samples. """ -
[docs] def __init__(self, +
+[docs] + def __init__(self, select_ratio: Optional[Annotated[float, Field(ge=0, le=1)]] = None, select_num: PositiveInt = None, @@ -113,7 +117,10 @@

Source code for data_juicer.ops.selector.random_selector

self.select_ratio = select_ratio self.select_num = select_num
-
[docs] def process(self, dataset): + +
+[docs] + def process(self, dataset): if len(dataset) <= 1: return dataset @@ -129,7 +136,9 @@

Source code for data_juicer.ops.selector.random_selector

select_num = self.select_num return MixtureFormatter.random_sample(dataset, - sample_number=select_num)
+ sample_number=select_num)
+
+
diff --git a/_modules/data_juicer/ops/selector/range_specified_field_selector.html b/_modules/data_juicer/ops/selector/range_specified_field_selector.html index 8b20f24dc..ec6a10e06 100644 --- a/_modules/data_juicer/ops/selector/range_specified_field_selector.html +++ b/_modules/data_juicer/ops/selector/range_specified_field_selector.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.selector.range_specified_field_selector — data_juicer 1.0.0 documentation + data_juicer.ops.selector.range_specified_field_selector — data_juicer 1.0.1 documentation - - - + + + @@ -88,12 +88,16 @@

Source code for data_juicer.ops.selector.range_specified_field_selector

from ..base_op import OPERATORS, Selector -
[docs]@OPERATORS.register_module('range_specified_field_selector') +
+[docs] +@OPERATORS.register_module('range_specified_field_selector') class RangeSpecifiedFieldSelector(Selector): """Selector to select a range of samples based on the sorted specified field value from smallest to largest. """ -
[docs] def __init__( +
+[docs] + def __init__( self, field_key: str = '', lower_percentile: Optional[Annotated[float, @@ -141,7 +145,10 @@

Source code for data_juicer.ops.selector.range_specified_field_selector

self.lower_rank = lower_rank self.upper_rank = upper_rank
-
[docs] def process(self, dataset): + +
+[docs] + def process(self, dataset): if len(dataset) <= 1 or not self.field_key: return dataset @@ -190,7 +197,9 @@

Source code for data_juicer.ops.selector.range_specified_field_selector

range(len(sub_dataset)), field_value_list.__getitem__) - return sub_dataset.select(select_index)
+ return sub_dataset.select(select_index)
+
+
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html index 92e8efe77..3c9ab12f4 100644 --- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html +++ b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html @@ -1,18 +1,18 @@ - + - data_juicer.ops.selector.topk_specified_field_selector — data_juicer 1.0.0 documentation + data_juicer.ops.selector.topk_specified_field_selector — data_juicer 1.0.1 documentation - - - + + + @@ -88,12 +88,16 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< from ..base_op import OPERATORS, Selector -
[docs]@OPERATORS.register_module('topk_specified_field_selector') +
+[docs] +@OPERATORS.register_module('topk_specified_field_selector') class TopkSpecifiedFieldSelector(Selector): """Selector to select top samples based on the sorted specified field value.""" -
[docs] def __init__(self, +
+[docs] + def __init__(self, field_key: str = '', top_ratio: Optional[Annotated[float, Field(ge=0, le=1)]] = None, @@ -129,7 +133,10 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< self.topk = topk self.reverse = reverse
-
[docs] def process(self, dataset): + +
+[docs] + def process(self, dataset): if len(dataset) <= 1 or not self.field_key: return dataset @@ -168,7 +175,9 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< select_index = heapq.nsmallest(int(select_num), range(len(dataset)), field_value_list.__getitem__) - return dataset.select(select_index)
+ return dataset.select(select_index)
+
+
diff --git a/_modules/index.html b/_modules/index.html index c9f624a03..7cd407189 100644 --- a/_modules/index.html +++ b/_modules/index.html @@ -1,18 +1,18 @@ - + - Overview: module code — data_juicer 1.0.0 documentation + Overview: module code — data_juicer 1.0.1 documentation - - - + + + @@ -184,6 +184,8 @@

All modules for which code is available

  • data_juicer.ops.mapper.optimize_response_mapper
  • data_juicer.ops.mapper.pair_preference_mapper
  • data_juicer.ops.mapper.punctuation_normalization_mapper
  • +
  • data_juicer.ops.mapper.python_file_mapper
  • +
  • data_juicer.ops.mapper.python_lambda_mapper
  • data_juicer.ops.mapper.remove_bibliography_mapper
  • data_juicer.ops.mapper.remove_comments_mapper
  • data_juicer.ops.mapper.remove_header_mapper
  • diff --git a/_static/basic.css b/_static/basic.css index cfc60b86c..7ebbd6d07 100644 --- a/_static/basic.css +++ b/_static/basic.css @@ -1,12 +1,5 @@ /* - * basic.css - * ~~~~~~~~~ - * * Sphinx stylesheet -- basic theme. - * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ /* -- main layout ----------------------------------------------------------- */ @@ -115,15 +108,11 @@ img { /* -- search page ----------------------------------------------------------- */ ul.search { - margin: 10px 0 0 20px; - padding: 0; + margin-top: 10px; } ul.search li { - padding: 5px 0 5px 20px; - background-image: url(file.png); - background-repeat: no-repeat; - background-position: 0 7px; + padding: 5px 0; } ul.search li a { @@ -237,6 +226,10 @@ a.headerlink { visibility: hidden; } +a:visited { + color: #551A8B; +} + h1:hover > a.headerlink, h2:hover > a.headerlink, h3:hover > a.headerlink, diff --git a/_static/doctools.js b/_static/doctools.js index d06a71d75..0398ebb9f 100644 --- a/_static/doctools.js +++ b/_static/doctools.js @@ -1,12 +1,5 @@ /* - * doctools.js - * ~~~~~~~~~~~ - * * Base JavaScript utilities for all Sphinx HTML documentation. - * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. 
- * */ "use strict"; diff --git a/_static/documentation_options.js b/_static/documentation_options.js index 995f333f6..d94acc6bd 100644 --- a/_static/documentation_options.js +++ b/_static/documentation_options.js @@ -1,6 +1,5 @@ -var DOCUMENTATION_OPTIONS = { - URL_ROOT: document.getElementById("documentation_options").getAttribute('data-url_root'), - VERSION: '1.0.0', +const DOCUMENTATION_OPTIONS = { + VERSION: '1.0.1', LANGUAGE: 'en', COLLAPSE_INDEX: false, BUILDER: 'html', diff --git a/_static/language_data.js b/_static/language_data.js index 250f5665f..c7fe6c6fa 100644 --- a/_static/language_data.js +++ b/_static/language_data.js @@ -1,19 +1,12 @@ /* - * language_data.js - * ~~~~~~~~~~~~~~~~ - * * This script contains the language-specific data used by searchtools.js, * namely the list of stopwords, stemmer, scorer and splitter. - * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"]; -/* Non-minified version is copied as a separate JS file, is available */ +/* Non-minified version is copied as a separate JS file, if available */ /** * Porter Stemmer diff --git a/_static/searchtools.js b/_static/searchtools.js index 97d56a74d..2c774d17a 100644 --- a/_static/searchtools.js +++ b/_static/searchtools.js @@ -1,12 +1,5 @@ /* - * searchtools.js - * ~~~~~~~~~~~~~~~~ - * * Sphinx JavaScript utilities for the full-text search. - * - * :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS. - * :license: BSD, see LICENSE for details. - * */ "use strict"; @@ -20,7 +13,7 @@ if (typeof Scorer === "undefined") { // and returns the new score. 
/* score: result => { - const [docname, title, anchor, descr, score, filename] = result + const [docname, title, anchor, descr, score, filename, kind] = result return score }, */ @@ -47,6 +40,14 @@ if (typeof Scorer === "undefined") { }; } +// Global search result kind enum, used by themes to style search results. +class SearchResultKind { + static get index() { return "index"; } + static get object() { return "object"; } + static get text() { return "text"; } + static get title() { return "title"; } +} + const _removeChildren = (element) => { while (element && element.lastChild) element.removeChild(element.lastChild); }; @@ -57,16 +58,20 @@ const _removeChildren = (element) => { const _escapeRegExp = (string) => string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string -const _displayItem = (item, searchTerms) => { +const _displayItem = (item, searchTerms, highlightTerms) => { const docBuilder = DOCUMENTATION_OPTIONS.BUILDER; - const docUrlRoot = DOCUMENTATION_OPTIONS.URL_ROOT; const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX; const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX; const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY; + const contentRoot = document.documentElement.dataset.content_root; - const [docName, title, anchor, descr, score, _filename] = item; + const [docName, title, anchor, descr, score, _filename, kind] = item; let listItem = document.createElement("li"); + // Add a class representing the item's type: + // can be used by a theme's CSS selector for styling + // See SearchResultKind for the class names. 
+ listItem.classList.add(`kind-${kind}`); let requestUrl; let linkUrl; if (docBuilder === "dirhtml") { @@ -75,28 +80,35 @@ const _displayItem = (item, searchTerms) => { if (dirname.match(/\/index\/$/)) dirname = dirname.substring(0, dirname.length - 6); else if (dirname === "index/") dirname = ""; - requestUrl = docUrlRoot + dirname; + requestUrl = contentRoot + dirname; linkUrl = requestUrl; } else { // normal html builders - requestUrl = docUrlRoot + docName + docFileSuffix; + requestUrl = contentRoot + docName + docFileSuffix; linkUrl = docName + docLinkSuffix; } let linkEl = listItem.appendChild(document.createElement("a")); linkEl.href = linkUrl + anchor; linkEl.dataset.score = score; linkEl.innerHTML = title; - if (descr) + if (descr) { listItem.appendChild(document.createElement("span")).innerHTML = " (" + descr + ")"; + // highlight search terms in the description + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); + } else if (showSearchSummary) fetch(requestUrl) .then((responseData) => responseData.text()) .then((data) => { if (data) listItem.appendChild( - Search.makeSearchSummary(data, searchTerms) + Search.makeSearchSummary(data, searchTerms, anchor) ); + // highlight search terms in the summary + if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js + highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted")); }); Search.output.appendChild(listItem); }; @@ -108,27 +120,46 @@ const _finishSearch = (resultCount) => { "Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories." 
); else - Search.status.innerText = _( - `Search finished, found ${resultCount} page(s) matching the search query.` - ); + Search.status.innerText = Documentation.ngettext( + "Search finished, found one page matching the search query.", + "Search finished, found ${resultCount} pages matching the search query.", + resultCount, + ).replace('${resultCount}', resultCount); }; const _displayNextItem = ( results, resultCount, - searchTerms + searchTerms, + highlightTerms, ) => { // results left, load the summary and display it // this is intended to be dynamic (don't sub resultsCount) if (results.length) { - _displayItem(results.pop(), searchTerms); + _displayItem(results.pop(), searchTerms, highlightTerms); setTimeout( - () => _displayNextItem(results, resultCount, searchTerms), + () => _displayNextItem(results, resultCount, searchTerms, highlightTerms), 5 ); } // search finished, update title and status message else _finishSearch(resultCount); }; +// Helper function used by query() to order search results. +// Each input is an array of [docname, title, anchor, descr, score, filename, kind]. +// Order the results by score (in opposite order of appearance, since the +// `_displayNextItem` function uses pop() to retrieve items) and then alphabetically. +const _orderResultsByScoreThenName = (a, b) => { + const leftScore = a[4]; + const rightScore = b[4]; + if (leftScore === rightScore) { + // same score: sort alphabetically + const leftTitle = a[1].toLowerCase(); + const rightTitle = b[1].toLowerCase(); + if (leftTitle === rightTitle) return 0; + return leftTitle > rightTitle ? -1 : 1; // inverted is intentional + } + return leftScore > rightScore ? 1 : -1; +}; /** * Default splitQuery function. 
Can be overridden in ``sphinx.search`` with a @@ -152,13 +183,26 @@ const Search = { _queued_query: null, _pulse_status: -1, - htmlToText: (htmlString) => { + htmlToText: (htmlString, anchor) => { const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html'); - htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() }); + for (const removalQuery of [".headerlink", "script", "style"]) { + htmlElement.querySelectorAll(removalQuery).forEach((el) => { el.remove() }); + } + if (anchor) { + const anchorContent = htmlElement.querySelector(`[role="main"] ${anchor}`); + if (anchorContent) return anchorContent.textContent; + + console.warn( + `Anchored content block not found. Sphinx search tries to obtain it via DOM query '[role=main] ${anchor}'. Check your theme or template.` + ); + } + + // if anchor not specified or not found, fall back to main content const docContent = htmlElement.querySelector('[role="main"]'); - if (docContent !== undefined) return docContent.textContent; + if (docContent) return docContent.textContent; + console.warn( - "Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template." + "Content block not found. Sphinx search tries to obtain it via DOM query '[role=main]'. Check your theme or template." 
); return ""; }, @@ -211,6 +255,7 @@ const Search = { searchSummary.classList.add("search-summary"); searchSummary.innerText = ""; const searchList = document.createElement("ul"); + searchList.setAttribute("role", "list"); searchList.classList.add("search"); const out = document.getElementById("search-results"); @@ -231,16 +276,7 @@ const Search = { else Search.deferQuery(query); }, - /** - * execute search (requires search index to be loaded) - */ - query: (query) => { - const filenames = Search._index.filenames; - const docNames = Search._index.docnames; - const titles = Search._index.titles; - const allTitles = Search._index.alltitles; - const indexEntries = Search._index.indexentries; - + _parseQuery: (query) => { // stem the search terms and add them to the correct list const stemmer = new Stemmer(); const searchTerms = new Set(); @@ -276,22 +312,40 @@ const Search = { // console.info("required: ", [...searchTerms]); // console.info("excluded: ", [...excludedTerms]); - // array of [docname, title, anchor, descr, score, filename] - let results = []; + return [query, searchTerms, excludedTerms, highlightTerms, objectTerms]; + }, + + /** + * execute search (requires search index to be loaded) + */ + _performSearch: (query, searchTerms, excludedTerms, highlightTerms, objectTerms) => { + const filenames = Search._index.filenames; + const docNames = Search._index.docnames; + const titles = Search._index.titles; + const allTitles = Search._index.alltitles; + const indexEntries = Search._index.indexentries; + + // Collect multiple result groups to be sorted separately and then ordered. + // Each is an array of [docname, title, anchor, descr, score, filename, kind]. 
+ const normalResults = []; + const nonMainIndexResults = []; + _removeChildren(document.getElementById("search-progress")); - const queryLower = query.toLowerCase(); + const queryLower = query.toLowerCase().trim(); for (const [title, foundTitles] of Object.entries(allTitles)) { - if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) { + if (title.toLowerCase().trim().includes(queryLower) && (queryLower.length >= title.length/2)) { for (const [file, id] of foundTitles) { - let score = Math.round(100 * queryLower.length / title.length) - results.push([ + const score = Math.round(Scorer.title * queryLower.length / title.length); + const boost = titles[file] === title ? 1 : 0; // add a boost for document titles + normalResults.push([ docNames[file], titles[file] !== title ? `${titles[file]} > ${title}` : title, id !== null ? "#" + id : "", null, - score, + score + boost, filenames[file], + SearchResultKind.title, ]); } } @@ -300,46 +354,48 @@ const Search = { // search for explicit entries in index directives for (const [entry, foundEntries] of Object.entries(indexEntries)) { if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) { - for (const [file, id] of foundEntries) { - let score = Math.round(100 * queryLower.length / entry.length) - results.push([ + for (const [file, id, isMain] of foundEntries) { + const score = Math.round(100 * queryLower.length / entry.length); + const result = [ docNames[file], titles[file], id ? 
"#" + id : "", null, score, filenames[file], - ]); + SearchResultKind.index, + ]; + if (isMain) { + normalResults.push(result); + } else { + nonMainIndexResults.push(result); + } } } } // lookup as object objectTerms.forEach((term) => - results.push(...Search.performObjectSearch(term, objectTerms)) + normalResults.push(...Search.performObjectSearch(term, objectTerms)) ); // lookup as search terms in fulltext - results.push(...Search.performTermsSearch(searchTerms, excludedTerms)); + normalResults.push(...Search.performTermsSearch(searchTerms, excludedTerms)); // let the scorer override scores with a custom scoring function - if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item))); - - // now sort the results by score (in opposite order of appearance, since the - // display function below uses pop() to retrieve items) and then - // alphabetically - results.sort((a, b) => { - const leftScore = a[4]; - const rightScore = b[4]; - if (leftScore === rightScore) { - // same score: sort alphabetically - const leftTitle = a[1].toLowerCase(); - const rightTitle = b[1].toLowerCase(); - if (leftTitle === rightTitle) return 0; - return leftTitle > rightTitle ? -1 : 1; // inverted is intentional - } - return leftScore > rightScore ? 1 : -1; - }); + if (Scorer.score) { + normalResults.forEach((item) => (item[4] = Scorer.score(item))); + nonMainIndexResults.forEach((item) => (item[4] = Scorer.score(item))); + } + + // Sort each group of results by score and then alphabetically by name. + normalResults.sort(_orderResultsByScoreThenName); + nonMainIndexResults.sort(_orderResultsByScoreThenName); + + // Combine the result groups in (reverse) order. + // Non-main index entries are typically arbitrary cross-references, + // so display them after other results. 
+ let results = [...nonMainIndexResults, ...normalResults]; // remove duplicate search results // note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept @@ -353,14 +409,19 @@ const Search = { return acc; }, []); - results = results.reverse(); + return results.reverse(); + }, + + query: (query) => { + const [searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms] = Search._parseQuery(query); + const results = Search._performSearch(searchQuery, searchTerms, excludedTerms, highlightTerms, objectTerms); // for debugging //Search.lastresults = results.slice(); // a copy // console.info("search results:", Search.lastresults); // print the results - _displayNextItem(results, results.length, searchTerms); + _displayNextItem(results, results.length, searchTerms, highlightTerms); }, /** @@ -424,6 +485,7 @@ const Search = { descr, score, filenames[match[0]], + SearchResultKind.object, ]); }; Object.keys(objects).forEach((prefix) => @@ -458,14 +520,18 @@ const Search = { // add support for partial matches if (word.length > 2) { const escapedWord = _escapeRegExp(word); - Object.keys(terms).forEach((term) => { - if (term.match(escapedWord) && !terms[word]) - arr.push({ files: terms[term], score: Scorer.partialTerm }); - }); - Object.keys(titleTerms).forEach((term) => { - if (term.match(escapedWord) && !titleTerms[word]) - arr.push({ files: titleTerms[word], score: Scorer.partialTitle }); - }); + if (!terms.hasOwnProperty(word)) { + Object.keys(terms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: terms[term], score: Scorer.partialTerm }); + }); + } + if (!titleTerms.hasOwnProperty(word)) { + Object.keys(titleTerms).forEach((term) => { + if (term.match(escapedWord)) + arr.push({ files: titleTerms[term], score: Scorer.partialTitle }); + }); + } } // no match but word was a required one @@ -488,9 +554,8 @@ const Search = { // create the mapping files.forEach((file) => { - if (fileMap.has(file) && 
fileMap.get(file).indexOf(word) === -1) - fileMap.get(file).push(word); - else fileMap.set(file, [word]); + if (!fileMap.has(file)) fileMap.set(file, [word]); + else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word); }); }); @@ -531,6 +596,7 @@ const Search = { null, score, filenames[file], + SearchResultKind.text, ]); } return results; @@ -541,8 +607,8 @@ const Search = { * search summary for a given text. keywords is a list * of stemmed words. */ - makeSearchSummary: (htmlText, keywords) => { - const text = Search.htmlToText(htmlText); + makeSearchSummary: (htmlText, keywords, anchor) => { + const text = Search.htmlToText(htmlText, anchor); if (text === "") return null; const textLower = text.toLowerCase(); diff --git a/_static/sphinx_highlight.js b/_static/sphinx_highlight.js index aae669d7e..8a96c69a1 100644 --- a/_static/sphinx_highlight.js +++ b/_static/sphinx_highlight.js @@ -29,14 +29,19 @@ const _highlight = (node, addItems, text, className) => { } span.appendChild(document.createTextNode(val.substr(pos, text.length))); + const rest = document.createTextNode(val.substr(pos + text.length)); parent.insertBefore( span, parent.insertBefore( - document.createTextNode(val.substr(pos + text.length)), + rest, node.nextSibling ) ); node.nodeValue = val.substr(0, pos); + /* There may be more occurrences of search term in this node. So call this + * function recursively on the remaining fragment. + */ + _highlight(rest, addItems, text, className); if (isInSVG) { const rect = document.createElementNS( @@ -140,5 +145,10 @@ const SphinxHighlight = { }, }; -_ready(SphinxHighlight.highlightSearchWords); -_ready(SphinxHighlight.initEscapeListener); +_ready(() => { + /* Do not call highlightSearchWords() when we are on the search page. + * It will highlight words from the *previous* search query. 
+ */ + if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords(); + SphinxHighlight.initEscapeListener(); +}); diff --git a/data_juicer.analysis.html b/data_juicer.analysis.html index 338acebb5..7b6fa48f4 100644 --- a/data_juicer.analysis.html +++ b/data_juicer.analysis.html @@ -1,19 +1,19 @@ - + - data_juicer.analysis — data_juicer 1.0.0 documentation + data_juicer.analysis — data_juicer 1.0.1 documentation - - - + + + @@ -84,15 +84,15 @@
    -

    data_juicer.analysis

    +

    data_juicer.analysis

    -class data_juicer.analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]
    +class data_juicer.analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]

    Bases: object

    Apply analysis on each column of stats respectively.

    -__init__(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]
    +__init__(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]

    Initialization method

    Parameters:
    @@ -109,7 +109,7 @@
    -analyze(show_percentiles=False, show=False, skip_export=False)[source]
    +analyze(show_percentiles=False, show=False, skip_export=False)[source]

    Apply analysis and draw the analysis figure for stats.

    Parameters:
    @@ -129,7 +129,7 @@
    -draw_hist(ax, data, save_path, percentiles=None, show=False)[source]
    +draw_hist(ax, data, save_path, percentiles=None, show=False)[source]

    Draw the histogram for the data.

    Parameters:
    @@ -150,7 +150,7 @@
    -draw_box(ax, data, save_path, percentiles=None, show=False)[source]
    +draw_box(ax, data, save_path, percentiles=None, show=False)[source]

    Draw the box plot for the data.

    Parameters:
    @@ -173,13 +173,13 @@
    -class data_juicer.analysis.DiversityAnalysis(dataset, output_path, lang_or_model='en')[source]
    +class data_juicer.analysis.DiversityAnalysis(dataset, output_path, lang_or_model='en')[source]

    Bases: object

    Apply diversity analysis for each sample and get an overall analysis result.

    -__init__(dataset, output_path, lang_or_model='en')[source]
    +__init__(dataset, output_path, lang_or_model='en')[source]

    Initialization method :param dataset: the dataset to be analyzed :param output_path: path to store the analysis results :param lang_or_model: the diversity model or a specific language used to load @@ -188,7 +188,7 @@

    -compute(lang_or_model=None, column_name='text')[source]
    +compute(lang_or_model=None, column_name='text')[source]

    Apply lexical tree analysis on each sample.

    Parameters:
    @@ -206,7 +206,7 @@
    -analyze(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]
    +analyze(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]

    Apply diversity analysis on the whole dataset.

    Parameters:
    @@ -229,13 +229,13 @@
    -class data_juicer.analysis.OverallAnalysis(dataset, output_path)[source]
    +class data_juicer.analysis.OverallAnalysis(dataset, output_path)[source]

    Bases: object

    Apply analysis on the overall stats, including mean, std, quantiles, etc.

    -__init__(dataset, output_path)[source]
    +__init__(dataset, output_path)[source]

    Initialization method.

    Parameters:
    @@ -249,12 +249,12 @@
    -refine_single_column(col)[source]
    +refine_single_column(col)[source]
    -analyze(percentiles=[], num_proc=1, skip_export=False)[source]
    +analyze(percentiles=[], num_proc=1, skip_export=False)[source]

    Apply overall analysis on the whole dataset based on the describe method of pandas.

    diff --git a/data_juicer.config.html b/data_juicer.config.html index a9b9fb307..13d47b524 100644 --- a/data_juicer.config.html +++ b/data_juicer.config.html @@ -1,19 +1,19 @@ - + - data_juicer.config — data_juicer 1.0.0 documentation + data_juicer.config — data_juicer 1.0.1 documentation - - - + + + @@ -86,10 +86,10 @@
    -

    data_juicer.config

    +

    data_juicer.config

    -data_juicer.config.init_configs(args: List[str] | None = None)[source]
    +data_juicer.config.init_configs(args: List[str] | None = None)[source]
    initialize the jsonargparse parser and parse configs from one of:
    1. POSIX-style commands line args;

    2. @@ -111,13 +111,13 @@
      -data_juicer.config.get_init_configs(cfg: Namespace | Dict)[source]
      +data_juicer.config.get_init_configs(cfg: Namespace | Dict)[source]

      set init configs of datajucer for cfg

      -data_juicer.config.export_config(cfg: Namespace, path: str, format: str = 'yaml', skip_none: bool = True, skip_check: bool = True, overwrite: bool = False, multifile: bool = True)[source]
      +data_juicer.config.export_config(cfg: Namespace, path: str, format: str = 'yaml', skip_none: bool = True, skip_check: bool = True, overwrite: bool = False, multifile: bool = True)[source]

      Save the config object, some params are from jsonargparse

      Parameters:
      @@ -140,7 +140,7 @@
      -data_juicer.config.merge_config(ori_cfg: Namespace, new_cfg: Namespace)[source]
      +data_juicer.config.merge_config(ori_cfg: Namespace, new_cfg: Namespace)[source]

      Merge configuration from new_cfg into ori_cfg

      Parameters:
      @@ -159,7 +159,7 @@
      -data_juicer.config.prepare_side_configs(ori_config: str | Namespace | Dict)[source]
      +data_juicer.config.prepare_side_configs(ori_config: str | Namespace | Dict)[source]
      parse the config if ori_config is a string of a config file path with

      yaml, yml or json format

      diff --git a/data_juicer.core.html b/data_juicer.core.html index 6efc54d0e..3a34b74cc 100644 --- a/data_juicer.core.html +++ b/data_juicer.core.html @@ -1,19 +1,19 @@ - + - data_juicer.core — data_juicer 1.0.0 documentation + data_juicer.core — data_juicer 1.0.1 documentation - - - + + + @@ -88,24 +88,24 @@
      -

      data_juicer.core

      +

      data_juicer.core

      -class data_juicer.core.Adapter(cfg: dict)[source]
      +class data_juicer.core.Adapter(cfg: dict)[source]

      Bases: object

      -MAX_BATCH_SIZE = 10000
      +MAX_BATCH_SIZE = 10000
      -__init__(cfg: dict)[source]
      +__init__(cfg: dict)[source]
      -static execute_and_probe(dataset, operators, sample_interval=0.5)[source]
      +static execute_and_probe(dataset, operators, sample_interval=0.5)[source]

      Process the input dataset and probe related information for each OP in the specified operator list.

      For now, we support the following targets to probe: @@ -117,7 +117,7 @@

      -static take_batch(dataset, config)[source]
      +static take_batch(dataset, config)[source]

      Split the dataset into batches based on configuration and load factor.

      Parameters:
      @@ -134,7 +134,7 @@
      -adapt_workloads(dataset, operators)[source]
      +adapt_workloads(dataset, operators)[source]

      Manage the scheduling and load balancing for the dataset processing.

      Parameters:
      @@ -148,7 +148,7 @@
      -probe_small_batch(dataset, operators)[source]
      +probe_small_batch(dataset, operators)[source]

      Perform small batch pre-execution to probe available resources, current load and estimated OP speed, returning load factors and speed ranks for each OP.

      @@ -169,7 +169,7 @@
      -batch_size_strategy(load_analysis_res, base_bs=1, util_th=0.9)[source]
      +batch_size_strategy(load_analysis_res, base_bs=1, util_th=0.9)[source]

      Decide the batch size for each op according to their workload analysis result and expected utilization threshold. We need to guarantee that the resource utilization won’t exceed the threshold. Now we only @@ -182,7 +182,7 @@

      -class data_juicer.core.Analyzer(cfg: Namespace | None = None)[source]
      +class data_juicer.core.Analyzer(cfg: Namespace | None = None)[source]

      Bases: object

      This Analyzer class is used to analyze a specific dataset.

      It will compute stats for all filter ops in the config file, apply @@ -192,7 +192,7 @@ dataset better.

      -__init__(cfg: Namespace | None = None)[source]
      +__init__(cfg: Namespace | None = None)[source]

      Initialization method.

      Parameters:
      @@ -203,7 +203,7 @@
      -run(load_data_np: int[int] | None = None, skip_export: bool = False, skip_return: bool = False)[source]
      +run(load_data_np: Annotated[int, Gt(gt=0)] | None = None, skip_export: bool = False, skip_return: bool = False)[source]

      Running the dataset analysis pipeline.

      Parameters:
      @@ -223,44 +223,49 @@
      -class data_juicer.core.NestedDataset(*args, **kargs)[source]
      +class data_juicer.core.NestedDataset(*args, **kargs)[source]

      Bases: Dataset, DJDataset

      Enhanced HuggingFace-Dataset for better usability and efficiency.

      -__init__(*args, **kargs)[source]
      +__init__(*args, **kargs)[source]
      -process(operators, *, work_dir=None, exporter=None, checkpointer=None, tracer=None)[source]
      +process(operators, *, work_dir=None, exporter=None, checkpointer=None, tracer=None, open_monitor=True)[source]

      process a list of operators on the dataset.

      +
      +
      +update_args(args, kargs, is_filter=False)[source]
      +
      +
      -map(*args, **kargs)[source]
      +map(*args, **kargs)[source]

      Override the map func, which is called by most common operations, such that the processed samples can be accessed by nested manner.

      -filter(*args, **kargs)[source]
      +filter(*args, **kargs)[source]

      Override the filter func, which is called by most common operations, such that the processed samples can be accessed by nested manner.

      -select(*args, **kargs)[source]
      +select(*args, **kargs)[source]

      Override the select func, such that selected samples can be accessed by nested manner.

      -classmethod from_dict(*args, **kargs)[source]
      +classmethod from_dict(*args, **kargs)[source]

      Override the from_dict func, which is called by most from_xx constructors, such that the constructed dataset object is NestedDataset.

      @@ -268,35 +273,35 @@
      -add_column(*args, **kargs)[source]
      +add_column(*args, **kargs)[source]

      Override the add column func, such that the processed samples can be accessed by nested manner.

      -select_columns(*args, **kargs)[source]
      +select_columns(*args, **kargs)[source]

      Override the select columns func, such that the processed samples can be accessed by nested manner.

      -remove_columns(*args, **kargs)[source]
      +remove_columns(*args, **kargs)[source]

      Override the remove columns func, such that the processed samples can be accessed by nested manner.

      -cleanup_cache_files()[source]
      +cleanup_cache_files()[source]

      Override the cleanup_cache_files func, clear raw and compressed cache files.

      -static load_from_disk(*args, **kargs)[source]
      +static load_from_disk(*args, **kargs)[source]

      Loads a dataset that was previously saved using [save_to_disk] from a dataset directory, or from a filesystem using any implementation of fsspec.spec.AbstractFileSystem.

      @@ -334,14 +339,14 @@
      -class data_juicer.core.Executor(cfg: Namespace | None = None)[source]
      +class data_juicer.core.Executor(cfg: Namespace | None = None)[source]

      Bases: object

      This Executor class is used to process a specific dataset.

      It will load the dataset and unify the format, then apply all the ops in the config file in order and generate a processed dataset.

      -__init__(cfg: Namespace | None = None)[source]
      +__init__(cfg: Namespace | None = None)[source]

      Initialization method.

      Parameters:
      @@ -352,7 +357,7 @@
      -sample_data(dataset_to_sample: Dataset | None = None, load_data_np=None, sample_ratio: float = 1.0, sample_algo: str = 'uniform', **kwargs)[source]
      +sample_data(dataset_to_sample: Dataset | None = None, load_data_np=None, sample_ratio: float = 1.0, sample_algo: str = 'uniform', **kwargs)[source]

      Sample a subset from the given dataset.

      Parameters:
      @@ -376,7 +381,7 @@
      -run(load_data_np: int[int] | None = None, skip_return=False)[source]
      +run(load_data_np: Annotated[int, Gt(gt=0)] | None = None, skip_return=False)[source]

      Running the dataset process pipeline.

      Parameters:
      @@ -395,33 +400,33 @@
      -class data_juicer.core.Exporter(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, keep_stats_in_res_ds=False, keep_hashes_in_res_ds=False, export_stats=True)[source]
      +class data_juicer.core.Exporter(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, keep_stats_in_res_ds=False, keep_hashes_in_res_ds=False, export_stats=True)[source]

      Bases: object

      The Exporter class is used to export a dataset to files of specific format.

      -KiB = 1024
      +KiB = 1024
      -MiB = 1048576
      +MiB = 1048576
      -GiB = 1073741824
      +GiB = 1073741824
      -TiB = 1099511627776
      +TiB = 1099511627776
      -__init__(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, keep_stats_in_res_ds=False, keep_hashes_in_res_ds=False, export_stats=True)[source]
      +__init__(export_path, export_shard_size=0, export_in_parallel=True, num_proc=1, export_ds=True, keep_stats_in_res_ds=False, keep_hashes_in_res_ds=False, export_stats=True)[source]

      Initialization method.

      Parameters:
      @@ -444,7 +449,7 @@
      -export(dataset)[source]
      +export(dataset)[source]

      Export method for a dataset.

      Parameters:
      @@ -458,13 +463,13 @@
      -export_compute_stats(dataset, export_path)[source]
      +export_compute_stats(dataset, export_path)[source]

      Export method for saving compute status in filters

      -static to_jsonl(dataset, export_path, num_proc=1, **kwargs)[source]
      +static to_jsonl(dataset, export_path, num_proc=1, **kwargs)[source]

      Export method for jsonl target files.

      Parameters:
      @@ -483,7 +488,7 @@
      -static to_json(dataset, export_path, num_proc=1, **kwargs)[source]
      +static to_json(dataset, export_path, num_proc=1, **kwargs)[source]

      Export method for json target files.

      Parameters:
      @@ -502,7 +507,7 @@
      -static to_parquet(dataset, export_path, **kwargs)[source]
      +static to_parquet(dataset, export_path, **kwargs)[source]

      Export method for parquet target files.

      Parameters:
      @@ -522,7 +527,7 @@
      -class data_juicer.core.Monitor[source]
      +class data_juicer.core.Monitor[source]

      Bases: object

      Monitor resource utilization and other information during the data processing.

      @@ -554,7 +559,7 @@

      ]

      -

      }

      +

      }

      Based on the structure above, the resource utilization analysis result will add several extra fields on the first level: ‘’’python @@ -576,27 +581,27 @@

      }<

    -

    }

    +

    }

    Only those fields in DYNAMIC_FIELDS will be analyzed.

    -DYNAMIC_FIELDS = {'Available mem.', 'CPU util.', 'Free mem.', 'GPU free mem.', 'GPU used mem.', 'GPU util.', 'Mem. util.', 'Used mem.'}
    +DYNAMIC_FIELDS = {'Available mem.', 'CPU util.', 'Free mem.', 'GPU free mem.', 'GPU used mem.', 'GPU util.', 'Mem. util.', 'Used mem.'}
    -__init__()[source]
    +__init__()[source]
    -monitor_all_resources()[source]
    +monitor_all_resources()[source]

    Detect the resource utilization of all distributed nodes.

    -static monitor_current_resources()[source]
    +static monitor_current_resources()[source]

    Detect the resource utilization of the current environment/machine. All “util.” values are ratios in the range [0.0, 1.0]. All “mem.” values are in MB.

    @@ -604,26 +609,26 @@

    }<
    -static draw_resource_util_graph(resource_util_list, store_dir)[source]
    +static draw_resource_util_graph(resource_util_list, store_dir)[source]
    -static analyze_resource_util_list(resource_util_list)[source]
    +static analyze_resource_util_list(resource_util_list)[source]

    Analyze the resource utilization for a given resource util list. Compute {‘max’, ‘min’, ‘avg’} of resource metrics for each dict item.

    -static analyze_single_resource_util(resource_util_dict)[source]
    +static analyze_single_resource_util(resource_util_dict)[source]

    Analyze the resource utilization for a single resource util dict. Compute {‘max’, ‘min’, ‘avg’} of each resource metric.

    -static monitor_func(func, args=None, sample_interval=0.5)[source]
    +static monitor_func(func, args=None, sample_interval=0.5)[source]

    Process the input dataset and probe related information for each OP in the specified operator list.

    For now, we support the following targets to probe: @@ -638,14 +643,14 @@

    }<
    -class data_juicer.core.Tracer(work_dir, show_num=10)[source]
    +class data_juicer.core.Tracer(work_dir, show_num=10)[source]

    Bases: object

    The tracer to trace the sample changes before and after an operator process.

    The comparison results will be stored in the work directory.

    -__init__(work_dir, show_num=10)[source]
    +__init__(work_dir, show_num=10)[source]

    Initialization method.

    Parameters:
    @@ -661,7 +666,7 @@

    }<
    -trace_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]
    +trace_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]

    Compare datasets before and after a Mapper.

    This will mainly show the different sample pairs due to the modification by the Mapper

    @@ -682,7 +687,7 @@

    }<
    -trace_batch_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]
    +trace_batch_mapper(op_name: str, previous_ds: Dataset, processed_ds: Dataset, text_key: str)[source]

    Compare datasets before and after a BatchMapper.

    This will mainly show the new samples augmented by the BatchMapper

    @@ -702,7 +707,7 @@

    }<
    -trace_filter(op_name: str, previous_ds: Dataset, processed_ds: Dataset)[source]
    +trace_filter(op_name: str, previous_ds: Dataset, processed_ds: Dataset)[source]

    Compare datasets before and after a Filter.

    This will mainly show the filtered samples by the Filter

    @@ -721,7 +726,7 @@

    }<
    -trace_deduplicator(op_name: str, dup_pairs: list)[source]
    +trace_deduplicator(op_name: str, dup_pairs: list)[source]

    Compare datasets before and after a Deduplicator.

    This will mainly show the near-duplicate sample pairs extracted by the Deduplicator. Different from the other two trace methods, diff --git a/data_juicer.format.html b/data_juicer.format.html index d4f29346c..174635cce 100644 --- a/data_juicer.format.html +++ b/data_juicer.format.html @@ -1,19 +1,19 @@ - + - data_juicer.format — data_juicer 1.0.0 documentation + data_juicer.format — data_juicer 1.0.1 documentation - - - + + + @@ -91,10 +91,10 @@

    -

    data_juicer.format

    +

    data_juicer.format

    -data_juicer.format.load_formatter(dataset_path, generated_dataset_config=None, text_keys=None, suffixes=[], add_suffix=False, **kwargs) BaseFormatter[source]
    +data_juicer.format.load_formatter(dataset_path, generated_dataset_config=None, text_keys=None, suffixes=[], add_suffix=False, **kwargs) BaseFormatter[source]

    Load mixture formatter for multiple different data formats with an optional weight(default 1.0) according to their formats.

    @@ -119,18 +119,18 @@
    -class data_juicer.format.JsonFormatter(dataset_path, suffixes=None, **kwargs)[source]
    +class data_juicer.format.JsonFormatter(dataset_path, suffixes=None, **kwargs)[source]

    Bases: LocalFormatter

    The class is used to load and format json-type files.

    Default suffixes are [‘.json’, ‘.jsonl’, ‘.jsonl.zst’]

    -SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
    +SUFFIXES = ['.json', '.jsonl', '.jsonl.zst']
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    +__init__(dataset_path, suffixes=None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -147,13 +147,13 @@
    -class data_juicer.format.LocalFormatter(dataset_path: str, type: str, suffixes: str | List[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
    +class data_juicer.format.LocalFormatter(dataset_path: str, type: str, suffixes: str | List[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]

    Bases: BaseFormatter

    The class is used to load a dataset from local files or local directory.

    -__init__(dataset_path: str, type: str, suffixes: str | List[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]
    +__init__(dataset_path: str, type: str, suffixes: str | List[str] | None = None, text_keys: List[str] | None = None, add_suffix=False, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -174,7 +174,7 @@
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    +load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]

    Load a dataset from dataset file or dataset directory, and unify its format.

    @@ -194,13 +194,13 @@
    -class data_juicer.format.RemoteFormatter(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
    +class data_juicer.format.RemoteFormatter(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]

    Bases: BaseFormatter

    The class is used to load a dataset from repository of huggingface hub.

    -__init__(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]
    +__init__(dataset_path: str, text_keys: List[str] | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -216,7 +216,7 @@
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    +load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]

    Load a dataset from HuggingFace, and unify its format.

    Parameters:
    @@ -235,18 +235,18 @@
    -class data_juicer.format.TextFormatter(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
    +class data_juicer.format.TextFormatter(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]

    Bases: LocalFormatter

    The class is used to load and format text-type files.

    e.g. [‘.txt’, ‘.pdf’, ‘.cpp’, ‘.docx’]

    -SUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']
    +SUFFIXES = ['.docx', '.pdf', '.txt', '.md', '.tex', '.asm', '.bat', '.cmd', '.c', '.h', '.cs', '.cpp', '.hpp', '.c++', '.h++', '.cc', '.hh', '.C', '.H', '.cmake', '.css', '.dockerfile', '.f90', '.f', '.f03', '.f08', '.f77', '.f95', '.for', '.fpp', '.go', '.hs', '.html', '.java', '.js', '.jl', '.lua', '.markdown', '.php', '.php3', '.php4', '.php5', '.phps', '.phpt', '.pl', '.pm', '.pod', '.perl', '.ps1', '.psd1', '.psm1', '.py', '.rb', '.rs', '.sql', '.scala', '.sh', '.bash', '.command', '.zsh', '.ts', '.tsx', '.vb', 'Dockerfile', 'Makefile', '.xml', '.rst', '.m', '.smali']
    -__init__(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]
    +__init__(dataset_path, suffixes=None, add_suffix=False, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -263,7 +263,7 @@
    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    +load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]

    Load a dataset from local text-type files.

    Parameters:
    @@ -282,18 +282,18 @@
    -class data_juicer.format.ParquetFormatter(dataset_path, suffixes=None, **kwargs)[source]
    +class data_juicer.format.ParquetFormatter(dataset_path, suffixes=None, **kwargs)[source]

    Bases: LocalFormatter

    The class is used to load and format parquet-type files.

    Default suffixes are [‘.parquet’]

    -SUFFIXES = ['.parquet']
    +SUFFIXES = ['.parquet']
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    +__init__(dataset_path, suffixes=None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -310,18 +310,18 @@
    -class data_juicer.format.CsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
    +class data_juicer.format.CsvFormatter(dataset_path, suffixes=None, **kwargs)[source]

    Bases: LocalFormatter

    The class is used to load and format csv-type files.

    Default suffixes are [‘.csv’]

    -SUFFIXES = ['.csv']
    +SUFFIXES = ['.csv']
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    +__init__(dataset_path, suffixes=None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -338,18 +338,18 @@
    -class data_juicer.format.TsvFormatter(dataset_path, suffixes=None, **kwargs)[source]
    +class data_juicer.format.TsvFormatter(dataset_path, suffixes=None, **kwargs)[source]

    Bases: LocalFormatter

    The class is used to load and format tsv-type files.

    Default suffixes are [‘.tsv’]

    -SUFFIXES = ['.tsv']
    +SUFFIXES = ['.tsv']
    -__init__(dataset_path, suffixes=None, **kwargs)[source]
    +__init__(dataset_path, suffixes=None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -366,14 +366,14 @@
    -class data_juicer.format.MixtureFormatter(dataset_path: str, suffixes: str | List[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]
    +class data_juicer.format.MixtureFormatter(dataset_path: str, suffixes: str | List[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]

    Bases: BaseFormatter

    The class mixes multiple datasets by randomly selecting samples from every dataset and merging them, and then exports the merged dataset as a new mixed dataset.

    -__init__(dataset_path: str, suffixes: str | List[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]
    +__init__(dataset_path: str, suffixes: str | List[str] | None = None, text_keys=None, add_suffix=False, max_samples=None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -394,7 +394,7 @@
    -classmethod random_sample(dataset, weight=1.0, sample_number=0, seed=None)[source]
    +classmethod random_sample(dataset, weight=1.0, sample_number=0, seed=None)[source]

    Randomly sample a subset from a dataset by weight or by number. If the sample number is bigger than 0, the sample number is used instead of the weight. @@ -407,7 +407,7 @@

    -load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]
    +load_dataset(num_proc: int = 1, global_cfg=None) Dataset[source]

    Load a mixed dataset.

    Parameters:
    @@ -426,17 +426,17 @@
    -class data_juicer.format.EmptyFormatter(length, feature_keys: List[str] = [], *args, **kwargs)[source]
    +class data_juicer.format.EmptyFormatter(length, feature_keys: List[str] = [], *args, **kwargs)[source]

    Bases: BaseFormatter

    The class is used to create empty data.

    -SUFFIXES = []
    +SUFFIXES = []
    -__init__(length, feature_keys: List[str] = [], *args, **kwargs)[source]
    +__init__(length, feature_keys: List[str] = [], *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -450,29 +450,29 @@
    -property null_value
    +property null_value
    -load_dataset(*args, **kwargs)[source]
    +load_dataset(*args, **kwargs)[source]
    -class data_juicer.format.RayEmptyFormatter(length, feature_keys: List[str] = [], *args, **kwargs)[source]
    +class data_juicer.format.RayEmptyFormatter(length, feature_keys: List[str] = [], *args, **kwargs)[source]

    Bases: BaseFormatter

    The class is used to create empty data for ray.

    -SUFFIXES = []
    +SUFFIXES = []
    -__init__(length, feature_keys: List[str] = [], *args, **kwargs)[source]
    +__init__(length, feature_keys: List[str] = [], *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -486,12 +486,12 @@
    -property null_value
    +property null_value
    -load_dataset(*args, **kwargs)[source]
    +load_dataset(*args, **kwargs)[source]
    diff --git a/data_juicer.html b/data_juicer.html index 38c49c70b..8ba2ce5b5 100644 --- a/data_juicer.html +++ b/data_juicer.html @@ -1,19 +1,19 @@ - + - data_juicer — data_juicer 1.0.0 documentation + data_juicer — data_juicer 1.0.1 documentation - - - + + + @@ -77,15 +77,15 @@
    -

    data_juicer

    +

    data_juicer

    -data_juicer.cuda_device_count()[source]
    +data_juicer.cuda_device_count()[source]
    -data_juicer.is_cuda_available()[source]
    +data_juicer.is_cuda_available()[source]
    diff --git a/data_juicer.ops.common.html b/data_juicer.ops.common.html index ec4396ee0..94f937084 100644 --- a/data_juicer.ops.common.html +++ b/data_juicer.ops.common.html @@ -1,19 +1,19 @@ - + - data_juicer.ops.common — data_juicer 1.0.0 documentation + data_juicer.ops.common — data_juicer 1.0.1 documentation - - - + + + @@ -90,10 +90,10 @@
    -

    data_juicer.ops.common

    +

    data_juicer.ops.common

    -data_juicer.ops.common.get_sentences_from_document(document, model_func=None)[source]
    +data_juicer.ops.common.get_sentences_from_document(document, model_func=None)[source]

    Get sentences from a document.

    Parameters:
    @@ -112,7 +112,7 @@
    -data_juicer.ops.common.get_words_from_document(document, token_func=None, new_line=True, tab=True)[source]
    +data_juicer.ops.common.get_words_from_document(document, token_func=None, new_line=True, tab=True)[source]

    Get words from a document. Useful to compute ratios, like the stopwords ratio.

    @@ -133,7 +133,7 @@
    -data_juicer.ops.common.merge_on_whitespace_tab_newline(sentences)[source]
    +data_juicer.ops.common.merge_on_whitespace_tab_newline(sentences)[source]

    This method is used to merge different levels of sub-sentences into one document. Invert the method split_on_newline_tab_whitespace. Removes concatenated separators.

    @@ -149,7 +149,7 @@
    -data_juicer.ops.common.split_on_newline_tab_whitespace(document)[source]
    +data_juicer.ops.common.split_on_newline_tab_whitespace(document)[source]

    This method is used to split the document into different levels of sub-sentences.

    First split on “\n”, then on “\t”, then on “ “. @@ -159,7 +159,7 @@

    -data_juicer.ops.common.split_on_whitespace(document, new_line=False, tab=False)[source]
    +data_juicer.ops.common.split_on_whitespace(document, new_line=False, tab=False)[source]

    This method also removes concatenated spaces.

    Parameters:
    @@ -177,7 +177,7 @@
    -data_juicer.ops.common.strip(document, strip_characters)[source]
    +data_juicer.ops.common.strip(document, strip_characters)[source]

    Way faster than document.strip(strip_characters) since strip_characters is now a set instead of a str, and it contains a lot of elements (all the emojis).

    @@ -196,7 +196,7 @@
    -data_juicer.ops.common.words_augmentation(words, group_size, join_char)[source]
    +data_juicer.ops.common.words_augmentation(words, group_size, join_char)[source]

    Augment words, especially for Chinese (without a space between words) and Vietnamese (with a space between syllables).

    @@ -215,7 +215,7 @@
    -data_juicer.ops.common.words_refinement(words, lower_case=False, strip_chars=None, use_words_aug=False, words_aug_group_sizes=[2], words_aug_join_char='')[source]
    +data_juicer.ops.common.words_refinement(words, lower_case=False, strip_chars=None, use_words_aug=False, words_aug_group_sizes=[2], words_aug_join_char='')[source]

    Refine split words. Non reversible since the document is split on multiple characters, words are stripped of special characters and characters are converted to lower case.

    @@ -240,7 +240,7 @@
    -data_juicer.ops.common.split_text_by_punctuation(text)[source]
    +data_juicer.ops.common.split_text_by_punctuation(text)[source]

    Split text by any zh and en punctuation

    Parameters:
    diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html index 632dd0df3..264a47c99 100644 --- a/data_juicer.ops.deduplicator.html +++ b/data_juicer.ops.deduplicator.html @@ -1,19 +1,19 @@ - + - data_juicer.ops.deduplicator — data_juicer 1.0.0 documentation + data_juicer.ops.deduplicator — data_juicer 1.0.1 documentation - - - + + + @@ -90,16 +90,16 @@
    -

    data_juicer.ops.deduplicator

    +

    data_juicer.ops.deduplicator

    -class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Bases: Deduplicator

    Deduplicator to deduplicate samples at document-level using exact matching.

    Using md5 hash to deduplicate samples.

    -__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -116,7 +116,7 @@
    -compute_hash(sample)[source]
    +compute_hash(sample)[source]

    Compute md5 hash values for the sample.

    Parameters:
    @@ -130,7 +130,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -150,14 +150,14 @@
    -class data_juicer.ops.deduplicator.DocumentMinhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: int[int] = 256, jaccard_threshold: float[float] = 0.7, num_bands: int[int] | None = None, num_rows_per_band: int[int] | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.DocumentMinhashDeduplicator(tokenization: str = 'space', window_size: Annotated[int, Gt(gt=0)] = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: Annotated[int, Gt(gt=0)] = 256, jaccard_threshold: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 0.7, num_bands: Annotated[int, Gt(gt=0)] | None = None, num_rows_per_band: Annotated[int, Gt(gt=0)] | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]

    Bases: Deduplicator

    Deduplicator to deduplicate samples at document-level using MinHashLSH.

    Different from simhash, minhash is stored as bytes, so they won’t be kept in the final dataset.

    -__init__(tokenization: str = 'space', window_size: int[int] = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: int[int] = 256, jaccard_threshold: float[float] = 0.7, num_bands: int[int] | None = None, num_rows_per_band: int[int] | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]
    +__init__(tokenization: str = 'space', window_size: Annotated[int, Gt(gt=0)] = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: Annotated[int, Gt(gt=0)] = 256, jaccard_threshold: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 0.7, num_bands: Annotated[int, Gt(gt=0)] | None = None, num_rows_per_band: Annotated[int, Gt(gt=0)] | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -196,7 +196,7 @@
    -compute_hash(sample)[source]
    +compute_hash(sample)[source]

    Compute minhash values for the sample.

    Parameters:
    @@ -210,7 +210,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -230,12 +230,12 @@
    -class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: Annotated[int, Gt(gt=0)] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: Annotated[int, Gt(gt=0)] = 6, hamming_distance: Annotated[int, Gt(gt=0)] = 4, *args, **kwargs)[source]

    Bases: Deduplicator

    Deduplicator to deduplicate samples at document-level using SimHash.

    -__init__(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    +__init__(tokenization: str = 'space', window_size: Annotated[int, Gt(gt=0)] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: Annotated[int, Gt(gt=0)] = 6, hamming_distance: Annotated[int, Gt(gt=0)] = 4, *args, **kwargs)[source]

    Initialization method :param tokenization: tokenization method for sample texts.

    It should be one of [space, punctuation, character]. For @@ -262,7 +262,7 @@

    -compute_hash(sample)[source]
    +compute_hash(sample)[source]

    Compute simhash values for the sample.

    Parameters:
    @@ -276,7 +276,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -296,13 +296,13 @@
    -class data_juicer.ops.deduplicator.ImageDeduplicator(method: str = 'phash', consider_text: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.ImageDeduplicator(method: str = 'phash', consider_text: bool = False, *args, **kwargs)[source]

    Bases: Deduplicator

    Deduplicator to deduplicate samples at document-level using exact matching of images between documents.

    -__init__(method: str = 'phash', consider_text: bool = False, *args, **kwargs)[source]
    +__init__(method: str = 'phash', consider_text: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -319,7 +319,7 @@
    -compute_hash(sample, context=False)[source]
    +compute_hash(sample, context=False)[source]

    Compute hash values for the sample.

    Parameters:
    @@ -333,7 +333,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -353,19 +353,19 @@
    -class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, *args, **kwargs)[source]

    Bases: Filter

    A basic exact matching deduplicator for RAY. Although its functionality is deduplication, it is implemented as Filter sub-class.

    -EMPTY_HASH_VALUE = 'EMPTY'
    +EMPTY_HASH_VALUE = 'EMPTY'
    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +__init__(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, *args, **kwargs)[source]

    Initialization. :param redis_host: the hostname of redis server :param redis_port: the port of redis server @@ -375,13 +375,13 @@

    -calculate_hash(sample, context=False)[source]
    +calculate_hash(sample, context=False)[source]

    Calculate hash value for the sample.

    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -400,7 +400,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -416,12 +416,12 @@
    -class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Bases: RayBasicDeduplicator

    Deduplicator to deduplicate samples at document-level using exact matching.

    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +__init__(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Initialization method. :param redis_host: the hostname of redis server :param redis_port: the port of redis server @@ -434,7 +434,7 @@

    -calculate_hash(sample, context=False)[source]
    +calculate_hash(sample, context=False)[source]

    Calculate hash value for the sample.

    @@ -442,13 +442,13 @@
    -class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, method: str = 'phash', *args, **kwargs)[source]

    Bases: RayBasicDeduplicator

    Deduplicator to deduplicate samples at document-level using exact matching of images between documents.

    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    +__init__(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, method: str = 'phash', *args, **kwargs)[source]

    Initialization. :param redis_host: the hostname of redis server :param redis_port: the port of redis server @@ -458,7 +458,7 @@

    -calculate_hash(sample, context=False)[source]
    +calculate_hash(sample, context=False)[source]

    Calculate hash value for the sample.

    @@ -466,13 +466,13 @@
    -class data_juicer.ops.deduplicator.RayVideoDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.RayVideoDeduplicator(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, *args, **kwargs)[source]

    Bases: RayBasicDeduplicator

    Deduplicator to deduplicate samples at document-level using exact matching of videos between documents.

    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +__init__(redis_host: str = 'localhost', redis_port: Annotated[int, Gt(gt=0)] = 6380, *args, **kwargs)[source]

    Initialization. :param redis_host: the hostname of redis server :param redis_port: the port of redis server @@ -482,7 +482,7 @@

    -calculate_hash(sample, context=False)[source]
    +calculate_hash(sample, context=False)[source]

    Calculate hash value for the sample.

    @@ -490,13 +490,13 @@
    -class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]

    Bases: Deduplicator

    Deduplicator to deduplicate samples at document-level using exact matching of videos between documents.

    -__init__(consider_text: bool = False, *args, **kwargs)[source]
    +__init__(consider_text: bool = False, *args, **kwargs)[source]

    Initialization.

    Parameters:
    @@ -512,7 +512,7 @@
    -compute_hash(sample, context=False)[source]
    +compute_hash(sample, context=False)[source]

    Compute hash values for the sample.

    Parameters:
    @@ -526,7 +526,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html index 8e43000eb..105f2aafd 100644 --- a/data_juicer.ops.filter.html +++ b/data_juicer.ops.filter.html @@ -1,19 +1,19 @@ - + - data_juicer.ops.filter — data_juicer 1.0.0 documentation + data_juicer.ops.filter — data_juicer 1.0.1 documentation - - - + + + @@ -125,16 +125,16 @@
    -

    data_juicer.ops.filter

    +

    data_juicer.ops.filter

    -class data_juicer.ops.filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with alphabet/numeric ratio within a specific range.

    -__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]
    +__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -158,24 +158,24 @@
    -compute_stats_batched(samples)[source]
    +compute_stats_batched(samples)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.AudioDurationFilter(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.AudioDurationFilter(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose audios’ durations are within a specified range.

    -__init__(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -197,7 +197,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -216,7 +216,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -232,13 +232,13 @@
    -class data_juicer.ops.filter.AudioNMFSNRFilter(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.AudioNMFSNRFilter(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: Annotated[int, Gt(gt=0)] = 500, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose audios’ SNRs (computed based on NMF) are within a specified range.

    -__init__(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: Annotated[int, Gt(gt=0)] = 500, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -262,7 +262,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -281,7 +281,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -297,13 +297,13 @@
    -class data_juicer.ops.filter.AudioSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.AudioSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose audio size (in bytes/kb/MB/…) within a specific range.

    -__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -325,7 +325,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -344,7 +344,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -360,13 +360,13 @@
    -class data_juicer.ops.filter.AverageLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.AverageLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with average line length within a specific range.

    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -386,25 +386,25 @@
    -compute_stats_batched(samples, context=False)[source]
    +compute_stats_batched(samples, context=False)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.CharacterRepetitionFilter(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    +class data_juicer.ops.filter.CharacterRepetitionFilter(rep_len: Annotated[int, Gt(gt=0)] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with char-level n-gram repetition ratio within a specific range.

    -__init__(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    +__init__(rep_len: Annotated[int, Gt(gt=0)] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -425,25 +425,25 @@
    -compute_stats_batched(samples)[source]
    +compute_stats_batched(samples)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[Annotated[int, Gt(gt=0)]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with flagged-word ratio less than a specific max value.

    -__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[Annotated[int, Gt(gt=0)]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -470,7 +470,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -489,7 +489,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -505,12 +505,12 @@
    -class data_juicer.ops.filter.ImageAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with aesthetics scores within a specific range.

    -__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -534,7 +534,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -553,7 +553,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -569,13 +569,13 @@
    -class data_juicer.ops.filter.ImageAspectRatioFilter(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageAspectRatioFilter(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with image aspect ratio within a specific range. AspectRatio = W / H.

    -__init__(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -595,7 +595,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -614,7 +614,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -630,12 +630,12 @@
    -class data_juicer.ops.filter.ImageFaceCountFilter(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageFaceCountFilter(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with the number of faces within a specific range.

    -__init__(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -657,7 +657,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -676,7 +676,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -692,12 +692,12 @@
    -class data_juicer.ops.filter.ImageFaceRatioFilter(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageFaceRatioFilter(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with face area ratios within a specific range.

    -__init__(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -719,7 +719,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -738,7 +738,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -754,12 +754,12 @@
    -class data_juicer.ops.filter.ImageNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose images have low nsfw scores.

    -__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -781,7 +781,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -800,7 +800,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -816,13 +816,13 @@
    -class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep image pairs with similarities between images within a specific range.

    -__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -844,7 +844,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -863,7 +863,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -879,12 +879,12 @@
    -class data_juicer.ops.filter.ImageShapeFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageShapeFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with image shape (w, h) within specific ranges.

    -__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -906,7 +906,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -925,7 +925,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -941,13 +941,13 @@
    -class data_juicer.ops.filter.ImageSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose image size (in Bytes/KB/MB/…) within a specific range.

    -__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -969,7 +969,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -988,7 +988,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1004,13 +1004,13 @@
    -class data_juicer.ops.filter.ImageTextMatchingFilter(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageTextMatchingFilter(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples those matching score between image and text within a specific range.

    -__init__(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +__init__(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1039,7 +1039,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1058,7 +1058,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1074,13 +1074,13 @@
    -class data_juicer.ops.filter.ImageTextSimilarityFilter(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageTextSimilarityFilter(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples those similarities between image and text within a specific range.

    -__init__(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +__init__(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1109,7 +1109,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1128,7 +1128,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1144,13 +1144,13 @@
    -class data_juicer.ops.filter.ImageWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.ImageWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose images have no watermark with high probability.

    -__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1173,7 +1173,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1192,7 +1192,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1208,13 +1208,13 @@
    -class data_juicer.ops.filter.LanguageIDScoreFilter(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]
    +class data_juicer.ops.filter.LanguageIDScoreFilter(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples in a specific language with confidence score larger than a specific min value.

    -__init__(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]
    +__init__(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1231,7 +1231,7 @@
    -compute_stats_single(sample)[source]
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1250,7 +1250,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1266,13 +1266,13 @@
    -class data_juicer.ops.filter.MaximumLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.MaximumLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with maximum line length within a specific range.

    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1292,25 +1292,25 @@
    -compute_stats_batched(samples, context=False)[source]
    +compute_stats_batched(samples, context=False)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.PerplexityFilter(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +class data_juicer.ops.filter.PerplexityFilter(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with perplexity score less than a specific max value.

    -__init__(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +__init__(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1327,25 +1327,25 @@
    -compute_stats_batched(samples, context=False)[source]
    +compute_stats_batched(samples, context=False)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.PhraseGroundingRecallFilter(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +class data_juicer.ops.filter.PhraseGroundingRecallFilter(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose locating recalls of phrases extracted from text in the images are within a specified range.

    -__init__(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +__init__(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1385,7 +1385,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1404,7 +1404,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1420,13 +1420,13 @@
    -class data_juicer.ops.filter.SpecialCharactersFilter(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]
    +class data_juicer.ops.filter.SpecialCharactersFilter(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with special-char ratio within a specific range.

    -__init__(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]
    +__init__(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1446,26 +1446,26 @@
    -compute_stats_batched(samples)[source]
    +compute_stats_batched(samples)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.SpecifiedFieldFilter(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +class data_juicer.ops.filter.SpecifiedFieldFilter(field_key: str = '', target_value: List = [], *args, **kwargs)[source]

    Bases: Filter

    Filter based on specified field information.

    If the specified field information in the sample is not within the specified target value, the sample will be filtered.

    -__init__(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +__init__(field_key: str = '', target_value: List = [], *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1485,7 +1485,7 @@
    -compute_stats_single(sample)[source]
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1504,7 +1504,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1520,14 +1520,14 @@
    -class data_juicer.ops.filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter based on specified numeric field information.

    If the specified numeric information in the sample is not within the specified range, the sample will be filtered.

    -__init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
    +__init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1551,7 +1551,7 @@
    -compute_stats_single(sample)[source]
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1570,7 +1570,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1586,13 +1586,13 @@
    -class data_juicer.ops.filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +class data_juicer.ops.filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[Annotated[int, Gt(gt=0)]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with stopword ratio larger than a specific min value.

    -__init__(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +__init__(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[Annotated[int, Gt(gt=0)]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1618,7 +1618,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1637,7 +1637,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1653,12 +1653,12 @@
    -class data_juicer.ops.filter.SuffixFilter(suffixes: str | List[str] = [], *args, **kwargs)[source]
    +class data_juicer.ops.filter.SuffixFilter(suffixes: str | List[str] = [], *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with specified suffix.

    -__init__(suffixes: str | List[str] = [], *args, **kwargs)[source]
    +__init__(suffixes: str | List[str] = [], *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1674,7 +1674,7 @@
    -compute_stats_single(sample)[source]
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1693,7 +1693,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1709,12 +1709,12 @@
    -class data_juicer.ops.filter.TextActionFilter(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]
    +class data_juicer.ops.filter.TextActionFilter(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep texts that contain actions in the text.

    -__init__(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]
    +__init__(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1731,7 +1731,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1750,7 +1750,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1766,13 +1766,13 @@
    -class data_juicer.ops.filter.TextEntityDependencyFilter(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]
    +class data_juicer.ops.filter.TextEntityDependencyFilter(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

    Bases: Filter

    Identify the entities in the text which are independent of other tokens, and filter them. The text containing no entities will be omitted.

    -__init__(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]
    +__init__(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1792,7 +1792,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1811,7 +1811,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1827,13 +1827,13 @@
    -class data_juicer.ops.filter.TextLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.TextLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with total text length within a specific range.

    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1853,25 +1853,25 @@
    -compute_stats_batched(samples)[source]
    +compute_stats_batched(samples)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with total token number within a specific range.

    -__init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1892,7 +1892,7 @@
    -compute_stats_single(sample)[source]
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1911,7 +1911,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1927,13 +1927,13 @@
    -class data_juicer.ops.filter.VideoAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: Annotated[int, Gt(gt=0)] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range.

    -__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: Annotated[int, Gt(gt=0)] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1976,7 +1976,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1995,7 +1995,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2011,13 +2011,13 @@
    -class data_juicer.ops.filter.VideoAspectRatioFilter(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoAspectRatioFilter(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with video aspect ratio within a specific range. AspectRatio = W / H.

    -__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2039,7 +2039,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2058,7 +2058,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2074,12 +2074,12 @@
    -class data_juicer.ops.filter.VideoDurationFilter(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoDurationFilter(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose videos’ durations are within a specified range.

    -__init__(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2101,7 +2101,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2120,7 +2120,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2136,13 +2136,13 @@
    -class data_juicer.ops.filter.VideoFramesTextSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoFramesTextSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose similarities between sampled video frame images and text are within a specific range.

    -__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2186,7 +2186,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2205,7 +2205,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2221,13 +2221,13 @@
    -class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, divisible: int[int] = 1, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: Annotated[float, Gt(gt=0)] = 2, size: Annotated[int, Gt(gt=0)] | Tuple[Annotated[int, Gt(gt=0)]] | Tuple[Annotated[int, Gt(gt=0)], Annotated[int, Gt(gt=0)]] | None = None, max_size: Annotated[int, Gt(gt=0)] | None = None, divisible: Annotated[int, Gt(gt=0)] = 1, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with video motion scores within a specific range. The Farneback algorithm from OpenCV is used to compute dense optical flow.

    -__init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, divisible: int[int] = 1, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: Annotated[float, Gt(gt=0)] = 2, size: Annotated[int, Gt(gt=0)] | Tuple[Annotated[int, Gt(gt=0)]] | Tuple[Annotated[int, Gt(gt=0)], Annotated[int, Gt(gt=0)]] | None = None, max_size: Annotated[int, Gt(gt=0)] | None = None, divisible: Annotated[int, Gt(gt=0)] = 1, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2262,17 +2262,17 @@
    -setup_model(rank=None)[source]
    +setup_model(rank=None)[source]
    -compute_flow(prev_frame, curr_frame)[source]
    +compute_flow(prev_frame, curr_frame)[source]
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2291,7 +2291,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2307,7 +2307,7 @@
    -class data_juicer.ops.filter.VideoMotionScoreRaftFilter(min_score: float = 1.0, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, divisible: int[int] = 8, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoMotionScoreRaftFilter(min_score: float = 1.0, max_score: float = 1.7976931348623157e+308, sampling_fps: Annotated[float, Gt(gt=0)] = 2, size: Annotated[int, Gt(gt=0)] | Tuple[Annotated[int, Gt(gt=0)]] | Tuple[Annotated[int, Gt(gt=0)], Annotated[int, Gt(gt=0)]] | None = None, max_size: Annotated[int, Gt(gt=0)] | None = None, divisible: Annotated[int, Gt(gt=0)] = 8, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: VideoMotionScoreFilter

    Filter to keep samples with video motion scores within a specified range. This operator utilizes the RAFT (Recurrent All-Pairs Field Transforms) @@ -2318,7 +2318,7 @@ https://arxiv.org/abs/2003.12039

    -__init__(min_score: float = 1.0, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, divisible: int[int] = 8, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_score: float = 1.0, max_score: float = 1.7976931348623157e+308, sampling_fps: Annotated[float, Gt(gt=0)] = 2, size: Annotated[int, Gt(gt=0)] | Tuple[Annotated[int, Gt(gt=0)]] | Tuple[Annotated[int, Gt(gt=0)], Annotated[int, Gt(gt=0)]] | None = None, max_size: Annotated[int, Gt(gt=0)] | None = None, divisible: Annotated[int, Gt(gt=0)] = 8, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2353,24 +2353,24 @@
    -setup_model(rank=None)[source]
    +setup_model(rank=None)[source]
    -compute_flow(prev_frame, curr_frame)[source]
    +compute_flow(prev_frame, curr_frame)[source]
    -class data_juicer.ops.filter.VideoNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose videos have low nsfw scores.

    -__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2409,7 +2409,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2428,7 +2428,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2444,13 +2444,13 @@
    -class data_juicer.ops.filter.VideoOcrAreaRatioFilter(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoOcrAreaRatioFilter(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: Annotated[int, Gt(gt=0)] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.

    -__init__(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: Annotated[int, Gt(gt=0)] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2480,12 +2480,12 @@
    -get_reader(rank)[source]
    +get_reader(rank)[source]
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2504,7 +2504,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2520,12 +2520,12 @@
    -class data_juicer.ops.filter.VideoResolutionFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoResolutionFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose videos’ resolutions are within a specified range.

    -__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2547,7 +2547,7 @@
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2566,7 +2566,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2582,12 +2582,12 @@
    -class data_juicer.ops.filter.VideoTaggingFromFramesFilter(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoTaggingFromFramesFilter(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose videos contain the given tags.

    -__init__(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2625,7 +2625,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2644,7 +2644,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2660,13 +2660,13 @@
    -class data_juicer.ops.filter.VideoWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.filter.VideoWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose videos have no watermark with high probability.

    -__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2706,7 +2706,7 @@
    -compute_stats_single(sample, rank=None, context=False)[source]
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2725,7 +2725,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2741,13 +2741,13 @@
    -class data_juicer.ops.filter.WordRepetitionFilter(lang: str = 'en', tokenization: bool = False, rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    +class data_juicer.ops.filter.WordRepetitionFilter(lang: str = 'en', tokenization: bool = False, rep_len: Annotated[int, Gt(gt=0)] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with word-level n-gram repetition ratio within a specific range.

    -__init__(lang: str = 'en', tokenization: bool = False, rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    +__init__(lang: str = 'en', tokenization: bool = False, rep_len: Annotated[int, Gt(gt=0)] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2770,25 +2770,25 @@
    -compute_stats_batched(samples, context=False)[source]
    +compute_stats_batched(samples, context=False)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.filter.WordsNumFilter(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.filter.WordsNumFilter(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with total words number within a specific range.

    -__init__(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2810,12 +2810,12 @@
    -compute_stats_batched(samples, context=False)[source]
    +compute_stats_batched(samples, context=False)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    diff --git a/data_juicer.ops.html b/data_juicer.ops.html index bf0ac806e..f54f88003 100644 --- a/data_juicer.ops.html +++ b/data_juicer.ops.html @@ -1,19 +1,19 @@ - + - data_juicer.ops — data_juicer 1.0.0 documentation + data_juicer.ops — data_juicer 1.0.1 documentation - - - + + + @@ -86,10 +86,10 @@
    -

    data_juicer.ops

    +

    data_juicer.ops

    -data_juicer.ops.load_ops(process_list)[source]
    +data_juicer.ops.load_ops(process_list)[source]

    Load op list according to the process list from config file.

    Parameters:
    @@ -104,11 +104,11 @@
    -class data_juicer.ops.Filter(*args, **kwargs)[source]
    +class data_juicer.ops.Filter(*args, **kwargs)[source]

    Bases: OP

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Base class that removes specific info.

    Parameters:
    @@ -128,17 +128,17 @@
    -compute_stats_batched(samples, *args, **kwargs)[source]
    +compute_stats_batched(samples, *args, **kwargs)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -compute_stats_single(sample, context=False)[source]
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -157,7 +157,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -171,18 +171,18 @@
    -run(dataset, *, exporter=None, tracer=None, reduce=True)[source]
    +run(dataset, *, exporter=None, tracer=None, reduce=True)[source]
    -class data_juicer.ops.Mapper(*args, **kwargs)[source]
    +class data_juicer.ops.Mapper(*args, **kwargs)[source]

    Bases: OP

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Base class that conducts data editing.

    Parameters:
    @@ -202,12 +202,12 @@
    -process_batched(samples, *args, **kwargs)[source]
    +process_batched(samples, *args, **kwargs)[source]
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -221,18 +221,18 @@
    -run(dataset, *, exporter=None, tracer=None)[source]
    +run(dataset, *, exporter=None, tracer=None)[source]
    -class data_juicer.ops.Deduplicator(*args, **kwargs)[source]
    +class data_juicer.ops.Deduplicator(*args, **kwargs)[source]

    Bases: OP

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Base class that conducts deduplication.

    Parameters:
    @@ -252,7 +252,7 @@
    -compute_hash(sample)[source]
    +compute_hash(sample)[source]

    Compute hash values for the sample.

    Parameters:
    @@ -266,7 +266,7 @@
    -process(dataset, show_num=0)[source]
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -284,18 +284,18 @@
    -run(dataset, *, exporter=None, tracer=None, reduce=True)[source]
    +run(dataset, *, exporter=None, tracer=None, reduce=True)[source]
    -class data_juicer.ops.Selector(*args, **kwargs)[source]
    +class data_juicer.ops.Selector(*args, **kwargs)[source]

    Bases: OP

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Base class that conducts selection in dataset-level.

    Parameters:
    @@ -315,7 +315,7 @@
    -process(dataset)[source]
    +process(dataset)[source]

    Dataset –> dataset.

    Parameters:
    @@ -329,7 +329,7 @@
    -run(dataset, *, exporter=None, tracer=None)[source]
    +run(dataset, *, exporter=None, tracer=None)[source]
    diff --git a/data_juicer.ops.mapper.html b/data_juicer.ops.mapper.html index 4ed75348c..3cecc8eb9 100644 --- a/data_juicer.ops.mapper.html +++ b/data_juicer.ops.mapper.html @@ -1,19 +1,19 @@ - + - data_juicer.ops.mapper — data_juicer 1.0.0 documentation + data_juicer.ops.mapper — data_juicer 1.0.1 documentation - - - + + + @@ -78,6 +78,8 @@
  • OptimizeResponseMapper
  • PairPreferenceMapper
  • PunctuationNormalizationMapper
  • +
  • PythonFileMapper
  • +
  • PythonLambdaMapper
  • RemoveBibliographyMapper
  • RemoveCommentsMapper
  • RemoveHeaderMapper
  • @@ -140,15 +142,15 @@
    -

    data_juicer.ops.mapper

    +

    data_juicer.ops.mapper

    -class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Simple wrapper for FFmpeg audio filters.

    -__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -167,7 +169,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -183,37 +185,37 @@
    -class data_juicer.ops.mapper.CalibrateQAMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.CalibrateQAMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Mapper to calibrate question-answer pairs based on reference text.

    -DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对【问题】和【回答】进行校准,使其更加详细、准确。\n按照以下格式输出:\n【问题】\n校准后的问题\n【回答】\n校准后的回答'
    +DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对【问题】和【回答】进行校准,使其更加详细、准确。\n按照以下格式输出:\n【问题】\n校准后的问题\n【回答】\n校准后的回答'
    -DEFAULT_INPUT_TEMPLATE = '{reference}\n{qa_pair}'
    +DEFAULT_INPUT_TEMPLATE = '{reference}\n{qa_pair}'
    -DEFAULT_REFERENCE_TEMPLATE = '【参考信息】\n{}'
    +DEFAULT_REFERENCE_TEMPLATE = '【参考信息】\n{}'
    -DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    -DEFAULT_OUTPUT_PATTERN = '【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
    +DEFAULT_OUTPUT_PATTERN = '【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
    -__init__(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -238,17 +240,17 @@
    -build_input(sample)[source]
    +build_input(sample)[source]
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -264,47 +266,47 @@
    -class data_juicer.ops.mapper.CalibrateQueryMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.CalibrateQueryMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: CalibrateQAMapper

    Mapper to calibrate query in question-answer pairs based on reference text.

    -DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,        使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。'
    +DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,        使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。'
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -class data_juicer.ops.mapper.CalibrateResponseMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.CalibrateResponseMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, reference_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: CalibrateQAMapper

    Mapper to calibrate response in question-answer pairs based on reference text.

    -DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,        使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。'
    +DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,        使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。'
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to convert Chinese between Traditional Chinese, Simplified Chinese and Japanese Kanji.

    -__init__(mode: str = 's2t', *args, **kwargs)[source]
    +__init__(mode: str = 's2t', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -342,20 +344,20 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean copyright comments at the beginning of the text samples.

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -369,19 +371,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean email in text samples.

    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -397,19 +399,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean html code in text samples.

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -423,19 +425,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean ipv4 and ipv6 address in text samples.

    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -451,19 +453,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.CleanLinksMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.CleanLinksMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean links like http/https/ftp in text samples.

    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -479,20 +481,20 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to expand macro definitions in the document body of Latex samples.

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -506,39 +508,39 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.ExtractEntityAttributeMapper(query_entities: List[str] = [], query_attributes: List[str] = [], api_model: str = 'gpt-4o', *, entity_key: str = '__dj__main_entity__', attribute_key: str = '__dj__attribute__', attribute_desc_key: str = '__dj__attribute_description__', support_text_key: str = '__dj__attribute_support_text__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt_template: str | None = None, input_template: str | None = None, attr_pattern_template: str | None = None, demo_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.ExtractEntityAttributeMapper(query_entities: List[str] = [], query_attributes: List[str] = [], api_model: str = 'gpt-4o', *, entity_key: str = '__dj__main_entity__', attribute_key: str = '__dj__attribute__', attribute_desc_key: str = '__dj__attribute_description__', support_text_key: str = '__dj__attribute_support_text__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt_template: str | None = None, input_template: str | None = None, attr_pattern_template: str | None = None, demo_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Extract attributes for given entities from the text

    -DEFAULT_SYSTEM_PROMPT_TEMPLATE = '给定一段文本,从文本中总结{entity}的{attribute},并且从原文摘录最能说明该{attribute}的代表性示例。\n要求:\n- 摘录的示例应该简短。\n- 遵循如下的回复格式:\n## {attribute}:\n{entity}的{attribute}描述...\n### 代表性示例1:\n说明{entity}该{attribute}的原文摘录1...\n### 代表性示例2:\n说明{entity}该{attribute}的原文摘录2...\n...\n'
    +DEFAULT_SYSTEM_PROMPT_TEMPLATE = '给定一段文本,从文本中总结{entity}的{attribute},并且从原文摘录最能说明该{attribute}的代表性示例。\n要求:\n- 摘录的示例应该简短。\n- 遵循如下的回复格式:\n## {attribute}:\n{entity}的{attribute}描述...\n### 代表性示例1:\n说明{entity}该{attribute}的原文摘录1...\n### 代表性示例2:\n说明{entity}该{attribute}的原文摘录2...\n...\n'
    -DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    +DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    -DEFAULT_ATTR_PATTERN_TEMPLATE = '\\#\\#\\s*{attribute}:\\s*(.*?)(?=\\#\\#\\#|\\Z)'
    +DEFAULT_ATTR_PATTERN_TEMPLATE = '\\#\\#\\s*{attribute}:\\s*(.*?)(?=\\#\\#\\#|\\Z)'
    -DEFAULT_DEMON_PATTERN = '\\#\\#\\#\\s*代表性示例(\\d+):\\s*(.*?)(?=\\#\\#\\#|\\Z)'
    +DEFAULT_DEMON_PATTERN = '\\#\\#\\#\\s*代表性示例(\\d+):\\s*(.*?)(?=\\#\\#\\#|\\Z)'
    -__init__(query_entities: List[str] = [], query_attributes: List[str] = [], api_model: str = 'gpt-4o', *, entity_key: str = '__dj__main_entity__', attribute_key: str = '__dj__attribute__', attribute_desc_key: str = '__dj__attribute_description__', support_text_key: str = '__dj__attribute_support_text__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt_template: str | None = None, input_template: str | None = None, attr_pattern_template: str | None = None, demo_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(query_entities: List[str] = [], query_attributes: List[str] = [], api_model: str = 'gpt-4o', *, entity_key: str = '__dj__main_entity__', attribute_key: str = '__dj__attribute__', attribute_desc_key: str = '__dj__attribute_description__', support_text_key: str = '__dj__attribute_support_text__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt_template: str | None = None, input_template: str | None = None, attr_pattern_template: str | None = None, demo_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method. :param query_entities: Entity list to be queried. :param query_attributes: Attribute list to be queried. @@ -584,69 +586,69 @@

    -parse_output(raw_output, attribute_name)[source]
    +parse_output(raw_output, attribute_name)[source]
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.ExtractEntityRelationMapper(api_model: str = 'gpt-4o', entity_types: List[str] | None = None, *, entity_key: str = '__dj__entity__', relation_key: str = '__dj__relation__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, tuple_delimiter: str | None = None, record_delimiter: str | None = None, completion_delimiter: str | None = None, max_gleaning: int[int] = 1, continue_prompt: str | None = None, if_loop_prompt: str | None = None, entity_pattern: str | None = None, relation_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.ExtractEntityRelationMapper(api_model: str = 'gpt-4o', entity_types: List[str] | None = None, *, entity_key: str = '__dj__entity__', relation_key: str = '__dj__relation__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, tuple_delimiter: str | None = None, record_delimiter: str | None = None, completion_delimiter: str | None = None, max_gleaning: Annotated[int, Ge(ge=0)] = 1, continue_prompt: str | None = None, if_loop_prompt: str | None = None, entity_pattern: str | None = None, relation_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Extract entities and relations in the text for knowledge graph.

    -DEFAULT_PROMPT_TEMPLATE = '-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n\n-Steps-\n1. Identify all entities. For each identified entity, extract the following information:\n- entity_name: Name of the entity\n- entity_type: One of the following types: [{entity_types}]\n- entity_description: Comprehensive description of the entity\'s attributes and activities\nFormat each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>\n\n2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\nFor each pair of related entities, extract the following information:\n- source_entity: name of the source entity, as identified in step 1\n- target_entity: name of the target entity, as identified in step 1\n- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity\n- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details\nFormat each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)\n\n3. Return output in the language of the given text as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.\n\n4. 
When finished, output {completion_delimiter}\n\n######################\n-Examples-\n######################\nExample 1:\n\nEntity_types: [person, technology, mission, organization, location]\nText:\n```\nwhile Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor\'s authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan\'s shared commitment to discovery was an unspoken rebellion against Cruz\'s narrowing vision of control and order.\n\nThen Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”\n\nThe underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor\'s, a wordless clash of wills softening into an uneasy truce.\n\nIt was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths\n```\n################\nOutput:\n("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}\n("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}\n("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}\n("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}\n("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor\'s authoritarian certainty and observes changes in Taylor\'s attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz\'s vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}\n("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual 
respect"{tuple_delimiter}8){record_delimiter}\n("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan\'s commitment to discovery is in rebellion against Cruz\'s vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}\n("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}\n#############################\nExample 2:\n\nEntity_types: [人物, 技术, 任务, 组织, 地点]\nText:\n```\n他们不再是单纯的执行者;他们已成为某个超越星辰与条纹的领域的信息守护者。这一使命的提升不能被规则和既定协议所束缚——它需要一种新的视角,一种新的决心。\n\n随着与华盛顿的通讯在背景中嗡嗡作响,对话中的紧张情绪通过嘟嘟声和静电噪音贯穿始终。团队站立着,一股不祥的气息笼罩着他们。显然,他们在接下来几个小时内做出的决定可能会重新定义人类在宇宙中的位置,或者将他们置于无知和潜在危险之中。\n\n随着与星辰的联系变得更加牢固,小组开始处理逐渐成形的警告,从被动接受者转变为积极参与者。梅瑟后来的直觉占据了上风——团队的任务已经演变,不再仅仅是观察和报告,而是互动和准备。一场蜕变已经开始,而“杜尔塞行动”则以他们大胆的新频率震动,这种基调不是由世俗设定的\n```\n#############\nOutput:\n("entity"{tuple_delimiter}"华盛顿"{tuple_delimiter}"地点"{tuple_delimiter}"华盛顿是正在接收通讯的地方,表明其在决策过程中的重要性。"){record_delimiter}\n("entity"{tuple_delimiter}"杜尔塞行动"{tuple_delimiter}"任务"{tuple_delimiter}"杜尔塞行动被描述为一项已演变为互动和准备的任务,显示出目标和活动的重大转变。"){record_delimiter}\n("entity"{tuple_delimiter}"团队"{tuple_delimiter}"组织"{tuple_delimiter}"团队被描绘成一群从被动观察者转变为积极参与者的人,展示了他们角色的动态变化。"){record_delimiter}\n("relationship"{tuple_delimiter}"团队"{tuple_delimiter}"华盛顿"{tuple_delimiter}"团队收到来自华盛顿的通讯,这影响了他们的决策过程。"{tuple_delimiter}"决策、外部影响"{tuple_delimiter}7){record_delimiter}\n("relationship"{tuple_delimiter}"团队"{tuple_delimiter}"杜尔塞行动"{tuple_delimiter}"团队直接参与杜尔塞行动,执行其演变后的目标和活动。"{tuple_delimiter}"任务演变、积极参与"{tuple_delimiter}9){completion_delimiter}\n#############################\nExample 3:\n\nEntity_types: [person, role, technology, organization, event, location, concept]\nText:\n```\ntheir voice slicing through the buzz of activity. 
"Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.\n\n"It\'s like it\'s learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers\' a whole new meaning."\n\nAlex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."\n\nTogether, they stood on the edge of the unknown, forging humanity\'s response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.\n\nThe encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation\n```\n#############\nOutput:\n("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}\n("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}\n("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}\n("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}\n("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial 
communication between humanity and an unknown intelligence."){record_delimiter}\n("entity"{tuple_delimiter}"Humanity\'s Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity\'s Response is the collective action taken by Alex\'s team in response to a message from an unknown intelligence."){record_delimiter}\n("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity\'s Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity\'s Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}\n("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}\n#############################\n-Real Data-\n######################\nEntity_types: [{entity_types}]\nText:\n```\n{input_text}\n```\n######################\nOutput:\n'
    +DEFAULT_PROMPT_TEMPLATE = '-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n\n-Steps-\n1. Identify all entities. For each identified entity, extract the following information:\n- entity_name: Name of the entity\n- entity_type: One of the following types: [{entity_types}]\n- entity_description: Comprehensive description of the entity\'s attributes and activities\nFormat each entity as ("entity"{tuple_delimiter}<entity_name>{tuple_delimiter}<entity_type>{tuple_delimiter}<entity_description>\n\n2. From the entities identified in step 1, identify all pairs of (source_entity, target_entity) that are *clearly related* to each other.\nFor each pair of related entities, extract the following information:\n- source_entity: name of the source entity, as identified in step 1\n- target_entity: name of the target entity, as identified in step 1\n- relationship_description: explanation as to why you think the source entity and the target entity are related to each other\n- relationship_strength: a numeric score indicating strength of the relationship between the source entity and target entity\n- relationship_keywords: one or more high-level key words that summarize the overarching nature of the relationship, focusing on concepts or themes rather than specific details\nFormat each relationship as ("relationship"{tuple_delimiter}<source_entity>{tuple_delimiter}<target_entity>{tuple_delimiter}<relationship_description>{tuple_delimiter}<relationship_keywords>{tuple_delimiter}<relationship_strength>)\n\n3. Return output in the language of the given text as a single list of all the entities and relationships identified in steps 1 and 2. Use **{record_delimiter}** as the list delimiter.\n\n4. 
When finished, output {completion_delimiter}\n\n######################\n-Examples-\n######################\nExample 1:\n\nEntity_types: [person, technology, mission, organization, location]\nText:\n```\nwhile Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor\'s authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan\'s shared commitment to discovery was an unspoken rebellion against Cruz\'s narrowing vision of control and order.\n\nThen Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”\n\nThe underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor\'s, a wordless clash of wills softening into an uneasy truce.\n\nIt was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths\n```\n################\nOutput:\n("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is a character who experiences frustration and is observant of the dynamics among other characters."){record_delimiter}\n("entity"{tuple_delimiter}"Taylor"{tuple_delimiter}"person"{tuple_delimiter}"Taylor is portrayed with authoritarian certainty and shows a moment of reverence towards a device, indicating a change in perspective."){record_delimiter}\n("entity"{tuple_delimiter}"Jordan"{tuple_delimiter}"person"{tuple_delimiter}"Jordan shares a commitment to discovery and has a significant interaction with Taylor regarding a device."){record_delimiter}\n("entity"{tuple_delimiter}"Cruz"{tuple_delimiter}"person"{tuple_delimiter}"Cruz is associated with a vision of control and order, influencing the dynamics among other characters."){record_delimiter}\n("entity"{tuple_delimiter}"The Device"{tuple_delimiter}"technology"{tuple_delimiter}"The Device is central to the story, with potential game-changing implications, and is revered by Taylor."){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Taylor"{tuple_delimiter}"Alex is affected by Taylor\'s authoritarian certainty and observes changes in Taylor\'s attitude towards the device."{tuple_delimiter}"power dynamics, perspective shift"{tuple_delimiter}7){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Jordan"{tuple_delimiter}"Alex and Jordan share a commitment to discovery, which contrasts with Cruz\'s vision."{tuple_delimiter}"shared goals, rebellion"{tuple_delimiter}6){record_delimiter}\n("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"Jordan"{tuple_delimiter}"Taylor and Jordan interact directly regarding the device, leading to a moment of mutual respect and an uneasy truce."{tuple_delimiter}"conflict resolution, mutual 
respect"{tuple_delimiter}8){record_delimiter}\n("relationship"{tuple_delimiter}"Jordan"{tuple_delimiter}"Cruz"{tuple_delimiter}"Jordan\'s commitment to discovery is in rebellion against Cruz\'s vision of control and order."{tuple_delimiter}"ideological conflict, rebellion"{tuple_delimiter}5){record_delimiter}\n("relationship"{tuple_delimiter}"Taylor"{tuple_delimiter}"The Device"{tuple_delimiter}"Taylor shows reverence towards the device, indicating its importance and potential impact."{tuple_delimiter}"reverence, technological significance"{tuple_delimiter}9){record_delimiter}\n#############################\nExample 2:\n\nEntity_types: [人物, 技术, 任务, 组织, 地点]\nText:\n```\n他们不再是单纯的执行者;他们已成为某个超越星辰与条纹的领域的信息守护者。这一使命的提升不能被规则和既定协议所束缚——它需要一种新的视角,一种新的决心。\n\n随着与华盛顿的通讯在背景中嗡嗡作响,对话中的紧张情绪通过嘟嘟声和静电噪音贯穿始终。团队站立着,一股不祥的气息笼罩着他们。显然,他们在接下来几个小时内做出的决定可能会重新定义人类在宇宙中的位置,或者将他们置于无知和潜在危险之中。\n\n随着与星辰的联系变得更加牢固,小组开始处理逐渐成形的警告,从被动接受者转变为积极参与者。梅瑟后来的直觉占据了上风——团队的任务已经演变,不再仅仅是观察和报告,而是互动和准备。一场蜕变已经开始,而“杜尔塞行动”则以他们大胆的新频率震动,这种基调不是由世俗设定的\n```\n#############\nOutput:\n("entity"{tuple_delimiter}"华盛顿"{tuple_delimiter}"地点"{tuple_delimiter}"华盛顿是正在接收通讯的地方,表明其在决策过程中的重要性。"){record_delimiter}\n("entity"{tuple_delimiter}"杜尔塞行动"{tuple_delimiter}"任务"{tuple_delimiter}"杜尔塞行动被描述为一项已演变为互动和准备的任务,显示出目标和活动的重大转变。"){record_delimiter}\n("entity"{tuple_delimiter}"团队"{tuple_delimiter}"组织"{tuple_delimiter}"团队被描绘成一群从被动观察者转变为积极参与者的人,展示了他们角色的动态变化。"){record_delimiter}\n("relationship"{tuple_delimiter}"团队"{tuple_delimiter}"华盛顿"{tuple_delimiter}"团队收到来自华盛顿的通讯,这影响了他们的决策过程。"{tuple_delimiter}"决策、外部影响"{tuple_delimiter}7){record_delimiter}\n("relationship"{tuple_delimiter}"团队"{tuple_delimiter}"杜尔塞行动"{tuple_delimiter}"团队直接参与杜尔塞行动,执行其演变后的目标和活动。"{tuple_delimiter}"任务演变、积极参与"{tuple_delimiter}9){completion_delimiter}\n#############################\nExample 3:\n\nEntity_types: [person, role, technology, organization, event, location, concept]\nText:\n```\ntheir voice slicing through the buzz of activity. 
"Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.\n\n"It\'s like it\'s learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers\' a whole new meaning."\n\nAlex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."\n\nTogether, they stood on the edge of the unknown, forging humanity\'s response to a message from the heavens. The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.\n\nThe encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation\n```\n#############\nOutput:\n("entity"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"person"{tuple_delimiter}"Sam Rivera is a member of a team working on communicating with an unknown intelligence, showing a mix of awe and anxiety."){record_delimiter}\n("entity"{tuple_delimiter}"Alex"{tuple_delimiter}"person"{tuple_delimiter}"Alex is the leader of a team attempting first contact with an unknown intelligence, acknowledging the significance of their task."){record_delimiter}\n("entity"{tuple_delimiter}"Control"{tuple_delimiter}"concept"{tuple_delimiter}"Control refers to the ability to manage or govern, which is challenged by an intelligence that writes its own rules."){record_delimiter}\n("entity"{tuple_delimiter}"Intelligence"{tuple_delimiter}"concept"{tuple_delimiter}"Intelligence here refers to an unknown entity capable of writing its own rules and learning to communicate."){record_delimiter}\n("entity"{tuple_delimiter}"First Contact"{tuple_delimiter}"event"{tuple_delimiter}"First Contact is the potential initial 
communication between humanity and an unknown intelligence."){record_delimiter}\n("entity"{tuple_delimiter}"Humanity\'s Response"{tuple_delimiter}"event"{tuple_delimiter}"Humanity\'s Response is the collective action taken by Alex\'s team in response to a message from an unknown intelligence."){record_delimiter}\n("relationship"{tuple_delimiter}"Sam Rivera"{tuple_delimiter}"Intelligence"{tuple_delimiter}"Sam Rivera is directly involved in the process of learning to communicate with the unknown intelligence."{tuple_delimiter}"communication, learning process"{tuple_delimiter}9){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"First Contact"{tuple_delimiter}"Alex leads the team that might be making the First Contact with the unknown intelligence."{tuple_delimiter}"leadership, exploration"{tuple_delimiter}10){record_delimiter}\n("relationship"{tuple_delimiter}"Alex"{tuple_delimiter}"Humanity\'s Response"{tuple_delimiter}"Alex and his team are the key figures in Humanity\'s Response to the unknown intelligence."{tuple_delimiter}"collective action, cosmic significance"{tuple_delimiter}8){record_delimiter}\n("relationship"{tuple_delimiter}"Control"{tuple_delimiter}"Intelligence"{tuple_delimiter}"The concept of Control is challenged by the Intelligence that writes its own rules."{tuple_delimiter}"power dynamics, autonomy"{tuple_delimiter}7){record_delimiter}\n#############################\n-Real Data-\n######################\nEntity_types: [{entity_types}]\nText:\n```\n{input_text}\n```\n######################\nOutput:\n'
    -DEFAULT_CONTINUE_PROMPT = 'MANY entities were missed in the last extraction.  Add them below using the same format:\n'
    +DEFAULT_CONTINUE_PROMPT = 'MANY entities were missed in the last extraction.  Add them below using the same format:\n'
    -DEFAULT_IF_LOOP_PROMPT = 'It appears some entities may have still been missed.  Answer YES | NO if there are still entities that need to be added.\n'
    +DEFAULT_IF_LOOP_PROMPT = 'It appears some entities may have still been missed.  Answer YES | NO if there are still entities that need to be added.\n'
    -DEFAULT_ENTITY_TYPES = ['organization', 'person', 'geo', 'event']
    +DEFAULT_ENTITY_TYPES = ['organization', 'person', 'geo', 'event']
    -DEFAULT_TUPLE_DELIMITER = '<|>'
    +DEFAULT_TUPLE_DELIMITER = '<|>'
    -DEFAULT_RECORD_DELIMITER = '##'
    +DEFAULT_RECORD_DELIMITER = '##'
    -DEFAULT_COMPLETION_DELIMITER = '<|COMPLETE|>'
    +DEFAULT_COMPLETION_DELIMITER = '<|COMPLETE|>'
    -DEFAULT_ENTITY_PATTERN = '\\("entity"(.*?)\\)'
    +DEFAULT_ENTITY_PATTERN = '\\("entity"(.*?)\\)'
    -DEFAULT_RELATION_PATTERN = '\\("relationship"(.*?)\\)'
    +DEFAULT_RELATION_PATTERN = '\\("relationship"(.*?)\\)'
    -__init__(api_model: str = 'gpt-4o', entity_types: List[str] | None = None, *, entity_key: str = '__dj__entity__', relation_key: str = '__dj__relation__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, tuple_delimiter: str | None = None, record_delimiter: str | None = None, completion_delimiter: str | None = None, max_gleaning: int[int] = 1, continue_prompt: str | None = None, if_loop_prompt: str | None = None, entity_pattern: str | None = None, relation_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', entity_types: List[str] | None = None, *, entity_key: str = '__dj__entity__', relation_key: str = '__dj__relation__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, tuple_delimiter: str | None = None, record_delimiter: str | None = None, completion_delimiter: str | None = None, max_gleaning: Annotated[int, Ge(ge=0)] = 1, continue_prompt: str | None = None, if_loop_prompt: str | None = None, entity_pattern: str | None = None, relation_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method. :param api_model: API model name. :param entity_types: Pre-defined entity types for knowledge graph. @@ -689,22 +691,22 @@

    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -add_message(messages, role, content)[source]
    +add_message(messages, role, content)[source]
    -light_rag_extraction(messages, rank=None)[source]
    +light_rag_extraction(messages, rank=None)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -720,27 +722,27 @@
    -class data_juicer.ops.mapper.ExtractEventMapper(api_model: str = 'gpt-4o', *, event_desc_key: str = '__dj__event_description__', relevant_char_key: str = '__dj__relevant_characters__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.ExtractEventMapper(api_model: str = 'gpt-4o', *, event_desc_key: str = '__dj__event_description__', relevant_char_key: str = '__dj__relevant_characters__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Extract events and relevant characters in the text

    -DEFAULT_SYSTEM_PROMPT = '给定一段文本,对文本的情节进行分点总结,并抽取与情节相关的人物。\n要求:\n- 尽量不要遗漏内容,不要添加文本中没有的情节,符合原文事实\n- 联系上下文说明前因后果,但仍然需要符合事实\n- 不要包含主观看法\n- 注意要尽可能保留文本的专有名词\n- 注意相关人物需要在对应情节中出现\n- 只抽取情节中的主要人物,不要遗漏情节的主要人物\n- 总结格式如下:\n### 情节1:\n- **情节描述**: ...\n- **相关人物**:人物1,人物2,人物3,...\n### 情节2:\n- **情节描述**: ...\n- **相关人物**:人物1,人物2,...\n### 情节3:\n- **情节描述**: ...\n- **相关人物**:人物1,...\n...\n'
    +DEFAULT_SYSTEM_PROMPT = '给定一段文本,对文本的情节进行分点总结,并抽取与情节相关的人物。\n要求:\n- 尽量不要遗漏内容,不要添加文本中没有的情节,符合原文事实\n- 联系上下文说明前因后果,但仍然需要符合事实\n- 不要包含主观看法\n- 注意要尽可能保留文本的专有名词\n- 注意相关人物需要在对应情节中出现\n- 只抽取情节中的主要人物,不要遗漏情节的主要人物\n- 总结格式如下:\n### 情节1:\n- **情节描述**: ...\n- **相关人物**:人物1,人物2,人物3,...\n### 情节2:\n- **情节描述**: ...\n- **相关人物**:人物1,人物2,...\n### 情节3:\n- **情节描述**: ...\n- **相关人物**:人物1,...\n...\n'
    -DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    +DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    -DEFAULT_OUTPUT_PATTERN = '\n        \\#\\#\\#\\s*情节(\\d+):\\s*\n        -\\s*\\*\\*情节描述\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*相关人物\\*\\*\\s*:\\s*(.*?)(?=\\#\\#\\#|\\Z)\n    '
    +DEFAULT_OUTPUT_PATTERN = '\n        \\#\\#\\#\\s*情节(\\d+):\\s*\n        -\\s*\\*\\*情节描述\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*相关人物\\*\\*\\s*:\\s*(.*?)(?=\\#\\#\\#|\\Z)\n    '
    -__init__(api_model: str = 'gpt-4o', *, event_desc_key: str = '__dj__event_description__', relevant_char_key: str = '__dj__relevant_characters__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', *, event_desc_key: str = '__dj__event_description__', relevant_char_key: str = '__dj__relevant_characters__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method. :param api_model: API model name. :param event_desc_key: The field name to store the event descriptions.

    @@ -773,39 +775,39 @@
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.ExtractKeywordMapper(api_model: str = 'gpt-4o', *, keyword_key: str = '__dj__keyword__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, completion_delimiter: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.ExtractKeywordMapper(api_model: str = 'gpt-4o', *, keyword_key: str = '__dj__keyword__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, completion_delimiter: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Generate keywords for the text

    -DEFAULT_PROMPT_TEMPLATE = '-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n\n-Steps-\n1. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.\nFormat the content-level key words as ("content_keywords" <high_level_keywords>)\n\n3. Return output in the language of the given text.\n\n4. When finished, output {completion_delimiter}\n\n######################\n-Examples-\n######################\nExample 1:\n\nText:\n```\nwhile Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor\'s authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan\'s shared commitment to discovery was an unspoken rebellion against Cruz\'s narrowing vision of control and order.\n\nThen Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”\n\nThe underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor\'s, a wordless clash of wills softening into an uneasy truce.\n\nIt was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths\n```\n################\nOutput:\n("content_keywords" "power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}\n#############################\nExample 2:\n\nText:\n```\n他们不再是单纯的执行者;他们已成为某个超越星辰与条纹的领域的信息守护者。这一使命的提升不能被规则和既定协议所束缚——它需要一种新的视角,一种新的决心。\n\n随着与华盛顿的通讯在背景中嗡嗡作响,对话中的紧张情绪通过嘟嘟声和静电噪音贯穿始终。团队站立着,一股不祥的气息笼罩着他们。显然,他们在接下来几个小时内做出的决定可能会重新定义人类在宇宙中的位置,或者将他们置于无知和潜在危险之中。\n\n随着与星辰的联系变得更加牢固,小组开始处理逐渐成形的警告,从被动接受者转变为积极参与者。梅瑟后来的直觉占据了上风——团队的任务已经演变,不再仅仅是观察和报告,而是互动和准备。一场蜕变已经开始,而“杜尔塞行动”则以他们大胆的新频率震动,这种基调不是由世俗设定的\n```\n#############\nOutput:\n("content_keywords" "任务演变, 决策制定, 积极参与, 宇宙意义"){completion_delimiter}\n#############################\nExample 3:\n\nEntity_types: [person, role, technology, organization, event, location, concept]\nText:\n```\ntheir voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.\n\n"It\'s like it\'s learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers\' a whole new meaning."\n\nAlex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."\n\nTogether, they stood on the edge of the unknown, forging humanity\'s response to a message from the heavens. 
The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.\n\nThe encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation\n```\n#############\nOutput:\n("content_keywords" "first contact, control, communication, cosmic significance"){completion_delimiter}\n-Real Data-\n######################\nText:\n```\n{input_text}\n```\n######################\nOutput:\n'
    +DEFAULT_PROMPT_TEMPLATE = '-Goal-\nGiven a text document that is potentially relevant to this activity and a list of entity types, identify all entities of those types from the text and all relationships among the identified entities.\n\n-Steps-\n1. Identify high-level key words that summarize the main concepts, themes, or topics of the entire text. These should capture the overarching ideas present in the document.\nFormat the content-level key words as ("content_keywords" <high_level_keywords>)\n\n3. Return output in the language of the given text.\n\n4. When finished, output {completion_delimiter}\n\n######################\n-Examples-\n######################\nExample 1:\n\nText:\n```\nwhile Alex clenched his jaw, the buzz of frustration dull against the backdrop of Taylor\'s authoritarian certainty. It was this competitive undercurrent that kept him alert, the sense that his and Jordan\'s shared commitment to discovery was an unspoken rebellion against Cruz\'s narrowing vision of control and order.\n\nThen Taylor did something unexpected. They paused beside Jordan and, for a moment, observed the device with something akin to reverence. “If this tech can be understood..." Taylor said, their voice quieter, "It could change the game for us. For all of us.”\n\nThe underlying dismissal earlier seemed to falter, replaced by a glimpse of reluctant respect for the gravity of what lay in their hands. Jordan looked up, and for a fleeting heartbeat, their eyes locked with Taylor\'s, a wordless clash of wills softening into an uneasy truce.\n\nIt was a small transformation, barely perceptible, but one that Alex noted with an inward nod. 
They had all been brought here by different paths\n```\n################\nOutput:\n("content_keywords" "power dynamics, ideological conflict, discovery, rebellion"){completion_delimiter}\n#############################\nExample 2:\n\nText:\n```\n他们不再是单纯的执行者;他们已成为某个超越星辰与条纹的领域的信息守护者。这一使命的提升不能被规则和既定协议所束缚——它需要一种新的视角,一种新的决心。\n\n随着与华盛顿的通讯在背景中嗡嗡作响,对话中的紧张情绪通过嘟嘟声和静电噪音贯穿始终。团队站立着,一股不祥的气息笼罩着他们。显然,他们在接下来几个小时内做出的决定可能会重新定义人类在宇宙中的位置,或者将他们置于无知和潜在危险之中。\n\n随着与星辰的联系变得更加牢固,小组开始处理逐渐成形的警告,从被动接受者转变为积极参与者。梅瑟后来的直觉占据了上风——团队的任务已经演变,不再仅仅是观察和报告,而是互动和准备。一场蜕变已经开始,而“杜尔塞行动”则以他们大胆的新频率震动,这种基调不是由世俗设定的\n```\n#############\nOutput:\n("content_keywords" "任务演变, 决策制定, 积极参与, 宇宙意义"){completion_delimiter}\n#############################\nExample 3:\n\nEntity_types: [person, role, technology, organization, event, location, concept]\nText:\n```\ntheir voice slicing through the buzz of activity. "Control may be an illusion when facing an intelligence that literally writes its own rules," they stated stoically, casting a watchful eye over the flurry of data.\n\n"It\'s like it\'s learning to communicate," offered Sam Rivera from a nearby interface, their youthful energy boding a mix of awe and anxiety. "This gives talking to strangers\' a whole new meaning."\n\nAlex surveyed his team—each face a study in concentration, determination, and not a small measure of trepidation. "This might well be our first contact," he acknowledged, "And we need to be ready for whatever answers back."\n\nTogether, they stood on the edge of the unknown, forging humanity\'s response to a message from the heavens. 
The ensuing silence was palpable—a collective introspection about their role in this grand cosmic play, one that could rewrite human history.\n\nThe encrypted dialogue continued to unfold, its intricate patterns showing an almost uncanny anticipation\n```\n#############\nOutput:\n("content_keywords" "first contact, control, communication, cosmic significance"){completion_delimiter}\n-Real Data-\n######################\nText:\n```\n{input_text}\n```\n######################\nOutput:\n'
    -DEFAULT_COMPLETION_DELIMITER = '<|COMPLETE|>'
    +DEFAULT_COMPLETION_DELIMITER = '<|COMPLETE|>'
    -DEFAULT_OUTPUT_PATTERN = '\\("content_keywords"(.*?)\\)'
    +DEFAULT_OUTPUT_PATTERN = '\\("content_keywords"(.*?)\\)'
    -__init__(api_model: str = 'gpt-4o', *, keyword_key: str = '__dj__keyword__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, completion_delimiter: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', *, keyword_key: str = '__dj__keyword__', api_endpoint: str | None = None, response_path: str | None = None, prompt_template: str | None = None, completion_delimiter: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method. :param api_model: API model name. :param keyword_key: The field name to store the keywords. It’s

    @@ -835,12 +837,12 @@
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -856,27 +858,27 @@
    -class data_juicer.ops.mapper.ExtractNicknameMapper(api_model: str = 'gpt-4o', *, nickname_key: str = '__dj__nickname__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.ExtractNicknameMapper(api_model: str = 'gpt-4o', *, nickname_key: str = '__dj__nickname__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Extract nickname relationship in the text.

    -DEFAULT_SYSTEM_PROMPT = '给定你一段文本,你的任务是将人物之间的称呼方式(昵称)提取出来。\n要求:\n- 需要给出说话人对被称呼人的称呼,不要搞反了。\n- 相同的说话人和被称呼人最多给出一个最常用的称呼。\n- 请不要输出互相没有昵称的称呼方式。\n- 输出格式如下:\n```\n### 称呼方式1\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n### 称呼方式2\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n### 称呼方式3\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n...\n```\n'
    +DEFAULT_SYSTEM_PROMPT = '给定你一段文本,你的任务是将人物之间的称呼方式(昵称)提取出来。\n要求:\n- 需要给出说话人对被称呼人的称呼,不要搞反了。\n- 相同的说话人和被称呼人最多给出一个最常用的称呼。\n- 请不要输出互相没有昵称的称呼方式。\n- 输出格式如下:\n```\n### 称呼方式1\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n### 称呼方式2\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n### 称呼方式3\n- **说话人**:...\n- **被称呼人**:...\n- **...对...的昵称**:...\n...\n```\n'
    -DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    +DEFAULT_INPUT_TEMPLATE = '# 文本\n```\n{text}\n```\n'
    -DEFAULT_OUTPUT_PATTERN = '\n        \\#\\#\\#\\s*称呼方式(\\d+)\\s*\n        -\\s*\\*\\*说话人\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*被称呼人\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*(.*?)对(.*?)的昵称\\*\\*\\s*:\\s*(.*?)(?=\\#\\#\\#|\\Z) # for double check\n    '
    +DEFAULT_OUTPUT_PATTERN = '\n        \\#\\#\\#\\s*称呼方式(\\d+)\\s*\n        -\\s*\\*\\*说话人\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*被称呼人\\*\\*\\s*:\\s*(.*?)\\s*\n        -\\s*\\*\\*(.*?)对(.*?)的昵称\\*\\*\\s*:\\s*(.*?)(?=\\#\\#\\#|\\Z) # for double check\n    '
    -__init__(api_model: str = 'gpt-4o', *, nickname_key: str = '__dj__nickname__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: int[int] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', *, nickname_key: str = '__dj__nickname__', api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, try_num: Annotated[int, Gt(gt=0)] = 3, drop_text: bool = False, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method. :param api_model: API model name. :param nickname_key: The field name to store the nickname

    @@ -906,12 +908,12 @@
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -927,12 +929,12 @@
    -class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to fix unicode errors in text samples.

    -__init__(normalization: str | None = None, *args, **kwargs)[source]
    +__init__(normalization: str | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -949,14 +951,14 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.GenerateQAFromExamplesMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +class data_juicer.ops.mapper.GenerateQAFromExamplesMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: Annotated[int, Gt(gt=0)] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    Mapper to generate question and answer pairs from examples. You should configure an empty dataset in your yaml config file: @@ -972,32 +974,32 @@ the length of the empty dataset.

    -DEFAULT_SYSTEM_PROMPT = '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:\n1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n'
    +DEFAULT_SYSTEM_PROMPT = '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:\n1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n'
    -DEFAULT_INPUT_TEMPLATE = '{}'
    +DEFAULT_INPUT_TEMPLATE = '{}'
    -DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}'
    +DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}'
    -DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'
    +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'
    -DEFAULT_OUTPUT_PATTERN = '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
    +DEFAULT_OUTPUT_PATTERN = '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
    -__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: Annotated[int, Gt(gt=0)] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1035,17 +1037,17 @@
    -build_input(qa_examples)[source]
    +build_input(qa_examples)[source]
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1061,7 +1063,7 @@
    -class data_juicer.ops.mapper.GenerateQAFromTextMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +class data_juicer.ops.mapper.GenerateQAFromTextMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    Mapper to generate question and answer pairs from text. Recommended model list: [

    @@ -1078,7 +1080,7 @@ and are suitable for Chinese.

    -__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1114,24 +1116,24 @@
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to blur images.

    -__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1149,7 +1151,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1165,13 +1167,13 @@
    -class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose texts are generated based on gpt-4-vision and the image.

    -__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1211,20 +1213,20 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose captions are generated based on another model and the figure.

    -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1278,7 +1280,7 @@
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]

    Note

    This is a batched_OP, whose input and output type are @@ -1290,7 +1292,7 @@

    Parameters:
    -

    samples

    +

    samples

    Returns:

    @@ -1302,12 +1304,12 @@
    -class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 0.8, guidance_scale: float = 7.5, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]

    Bases: Mapper

    Generate image by diffusion model

    -__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] = 0.8, guidance_scale: float = 7.5, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1371,7 +1373,7 @@
    -process_batched(samples, rank=None, context=False)[source]
    +process_batched(samples, rank=None, context=False)[source]

    Note

    This is a batched_OP, whose input and output type are @@ -1381,7 +1383,7 @@

    Parameters:
    -

    samples

    +

    samples

    Returns:

    @@ -1393,12 +1395,12 @@
    -class data_juicer.ops.mapper.ImageFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float[float] = 2, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: Annotated[float, Ge(ge=0)] = 2, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to blur faces detected in images.

    -__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float[float] = 2, *args, **kwargs)[source]
    +__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: Annotated[float, Ge(ge=0)] = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1417,7 +1419,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1433,12 +1435,12 @@
    -class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate image tags.

    -__init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    +__init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    Initialization method. :param tag_field_name: the field name to store the tags. It’s

    @@ -1456,7 +1458,7 @@
    -process_single(sample, rank=None, context=False)[source]
    +process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1472,12 +1474,12 @@
    -class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to simply augment samples in English based on nlpaug library.

    -__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    +__init__(sequential: bool = False, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]

    Initialization method. All augmentation methods use default parameters by default. We recommend using only 1-3 augmentation methods at a time. Otherwise, the semantics of samples might be changed @@ -1534,19 +1536,19 @@

    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.NlpcdaZhMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.NlpcdaZhMapper(sequential: bool = False, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to simply augment samples in Chinese based on nlpcda library.

    -__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]
    +__init__(sequential: bool = False, aug_num: Annotated[int, Gt(gt=0)] = 1, keep_original_sample: bool = True, replace_similar_word: bool = False, replace_homophone_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, replace_equivalent_num: bool = False, *args, **kwargs)[source]

    Initialization method. All augmentation methods use default parameters by default. We recommend using only 1-3 augmentation methods at a time. Otherwise, the semantics of samples might be changed @@ -1593,39 +1595,39 @@

    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.OptimizeQAMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +class data_juicer.ops.mapper.OptimizeQAMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    Mapper to optimize question-answer pairs.

    -DEFAULT_SYSTEM_PROMPT = '请优化输入的问答对,使【问题】和【回答】都更加详细、准确。必须按照以下标记格式,直接输出优化后的问答对:\n【问题】\n优化后的问题\n【回答】\n优化后的回答'
    +DEFAULT_SYSTEM_PROMPT = '请优化输入的问答对,使【问题】和【回答】都更加详细、准确。必须按照以下标记格式,直接输出优化后的问答对:\n【问题】\n优化后的问题\n【回答】\n优化后的回答'
    -DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
    +DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
    -DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    -DEFAULT_OUTPUT_PATTERN = '.*?【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
    +DEFAULT_OUTPUT_PATTERN = '.*?【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
    -__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1653,17 +1655,17 @@
    -build_input(sample)[source]
    +build_input(sample)[source]
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1679,61 +1681,61 @@
    -class data_juicer.ops.mapper.OptimizeQueryMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +class data_juicer.ops.mapper.OptimizeQueryMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: OptimizeQAMapper

    Mapper to optimize query in question-answer pairs.

    -DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'
    +DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -class data_juicer.ops.mapper.OptimizeResponseMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
    +class data_juicer.ops.mapper.OptimizeResponseMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: OptimizeQAMapper

    Mapper to optimize response in question-answer pairs.

    -DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
    +DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -class data_juicer.ops.mapper.PairPreferenceMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, rejected_key: str = 'rejected_response', reason_key: str = 'reason', try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +class data_juicer.ops.mapper.PairPreferenceMapper(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, rejected_key: str = 'rejected_response', reason_key: str = 'reason', try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Bases: Mapper

    Mapper to construct paired preference samples.

    -DEFAULT_SYSTEM_PROMPT = '你的任务是根据参考信息修改问答对中的回答,在语言风格、事实性、人物身份、立场等任一方面与原回答相反。必须按照以下标记格式输出,不要输出其他多余内容。\n【回答】\n生成的新回答\n【原因】\n生成该回答的原因'
    +DEFAULT_SYSTEM_PROMPT = '你的任务是根据参考信息修改问答对中的回答,在语言风格、事实性、人物身份、立场等任一方面与原回答相反。必须按照以下标记格式输出,不要输出其他多余内容。\n【回答】\n生成的新回答\n【原因】\n生成该回答的原因'
    -DEFAULT_INPUT_TEMPLATE = '【参考信息】\n{reference}\n\n以下是原始问答对:\n【问题】\n{query}\n【回答】\n{response}'
    +DEFAULT_INPUT_TEMPLATE = '【参考信息】\n{reference}\n\n以下是原始问答对:\n【问题】\n{query}\n【回答】\n{response}'
    -DEFAULT_OUTPUT_PATTERN = '.*?【回答】\\s*(.*?)\\s*【原因】\\s*(.*)'
    +DEFAULT_OUTPUT_PATTERN = '.*?【回答】\\s*(.*?)\\s*【原因】\\s*(.*)'
    -__init__(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, rejected_key: str = 'rejected_response', reason_key: str = 'reason', try_num: int[int] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]
    +__init__(api_model: str = 'gpt-4o', *, api_endpoint: str | None = None, response_path: str | None = None, system_prompt: str | None = None, input_template: str | None = None, output_pattern: str | None = None, rejected_key: str = 'rejected_response', reason_key: str = 'reason', try_num: Annotated[int, Gt(gt=0)] = 3, model_params: Dict = {}, sampling_params: Dict = {}, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1764,17 +1766,17 @@
    -build_input(sample)[source]
    +build_input(sample)[source]
    -parse_output(raw_output)[source]
    +parse_output(raw_output)[source]
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -1790,13 +1792,13 @@
    -class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to normalize unicode punctuations to English punctuations in text samples.

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1810,20 +1812,101 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source] +
    + +
    + +
    +
    +class data_juicer.ops.mapper.PythonFileMapper(file_path: str = '', function_name: str = 'process_single', batched: bool = False, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper for executing Python function defined in a file.

    +
    +
    +__init__(file_path: str = '', function_name: str = 'process_single', batched: bool = False, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • file_path – The path to the Python file containing the function +to be executed.

    • +
    • function_name – The name of the function defined in the file +to be executed.

    • +
    • batched – A boolean indicating whether to process input data in +batches.

    • +
    • kwargs – Additional keyword arguments passed to the parent class.

    • +
    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    Invoke the loaded function with the provided sample.

    +
    + +
    +
    +process_batched(samples)[source]
    +

    Invoke the loaded function with the provided samples.

    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.PythonLambdaMapper(lambda_str: str = '', batched: bool = False, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper for executing Python lambda function on data samples.

    +
    +
    +__init__(lambda_str: str = '', batched: bool = False, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • lambda_str – A string representation of the lambda function to be +executed on data samples. If empty, the identity function is used.

    • +
    • batched – A boolean indicating whether to process input data in +batches.

    • +
    • kwargs – Additional keyword arguments passed to the parent class.

    • +
    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    +
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove bibliography at the end of documents in Latex samples.

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1837,20 +1920,20 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove comments in different kinds of documents.

    Only supports ‘tex’ for now.

    -__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
    +__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1867,20 +1950,20 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove headers at the beginning of documents in Latex samples.

    -__init__(drop_no_head: bool = True, *args, **kwargs)[source]
    +__init__(drop_no_head: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1896,19 +1979,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove long words within a specific range.

    -__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1926,24 +2009,24 @@
    -should_keep_long_word(word)[source]
    +should_keep_long_word(word)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove non-Chinese characters in text samples.

    -__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
    +__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1960,19 +2043,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove repeat sentences in text samples.

    -__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -1995,19 +2078,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to clean specific chars in text samples.

    -__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
    +__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2023,21 +2106,21 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])] = 2, max_col: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])] = 20, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove table texts from text samples.

    A regular expression is used to remove tables whose number of columns falls within the specified range.

    -__init__(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +__init__(min_col: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])] = 2, max_col: Annotated[int, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=2), Le(le=20)])] = 20, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2053,19 +2136,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to remove words with incorrect substrings.

    -__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
    +__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2082,26 +2165,26 @@
    -should_keep_word_with_incorrect_substrings(word, substrings)[source]
    +should_keep_word_with_incorrect_substrings(word, substrings)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to replace all content in the text that matches a specific regular expression pattern with a designated replacement string.

    -__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2117,19 +2200,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to split text samples to sentences.

    -__init__(lang: str = 'en', *args, **kwargs)[source]
    +__init__(lang: str = 'en', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2144,19 +2227,19 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.TextChunkMapper(max_len: int[int] | None = None, split_pattern: str | None = '\\n\\n', overlap_len: int[int] = 0, tokenizer: str | None = None, trust_remote_code: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.TextChunkMapper(max_len: Annotated[int, Gt(gt=0)] | None = None, split_pattern: str | None = '\\n\\n', overlap_len: Annotated[int, Ge(ge=0)] = 0, tokenizer: str | None = None, trust_remote_code: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Split input text to chunks.

    -__init__(max_len: int[int] | None = None, split_pattern: str | None = '\\n\\n', overlap_len: int[int] = 0, tokenizer: str | None = None, trust_remote_code: bool = False, *args, **kwargs)[source]
    +__init__(max_len: Annotated[int, Gt(gt=0)] | None = None, split_pattern: str | None = '\\n\\n', overlap_len: Annotated[int, Ge(ge=0)] = 0, tokenizer: str | None = None, trust_remote_code: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2184,30 +2267,30 @@
    -recursively_chunk(text)[source]
    +recursively_chunk(text)[source]
    -get_text_chunks(text, rank=None)[source]
    +get_text_chunks(text, rank=None)[source]
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to caption a video according to its audio streams based on Qwen-Audio model.

    -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
    +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2225,21 +2308,21 @@
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose captions are generated based on an image-to-text model and sampled video frames. Captions from different frames will be concatenated to a single string.

    -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2308,10 +2391,10 @@
    -process_batched(samples, rank=None, context=False)[source]
    +process_batched(samples, rank=None, context=False)[source]
    Parameters:
    -

    samples

    +

    samples

    Returns:

    @@ -2332,13 +2415,13 @@
    -class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: Annotated[int, Gt(gt=0)] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate video captions by summarizing several kinds of generated texts (captions from video/audio/frames, tags from audio/frames, …)

    -__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
    +__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: Annotated[int, Gt(gt=0)] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2388,20 +2471,20 @@
    -process_batched(samples, rank=None)[source]
    +process_batched(samples, rank=None)[source]
    -class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose captions are generated based on a video-to-text model and sampled video frames.

    -__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: Annotated[int, Gt(gt=0)] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2471,10 +2554,10 @@
    -process_batched(samples, rank=None, context=False)[source]
    +process_batched(samples, rank=None, context=False)[source]
    Parameters:
    -

    samples

    +

    samples

    Returns:

    @@ -2495,12 +2578,12 @@
    -class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Simple wrapper for FFmpeg video filters.

    -__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2519,7 +2602,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2535,12 +2618,12 @@
    -class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to blur faces detected in videos.

    -__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2559,7 +2642,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2575,12 +2658,12 @@
    -class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: Annotated[int, Gt(gt=0)] = 10, min_frame_threshold: Annotated[int, Gt(gt=0)] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]

    Bases: Mapper

    Remove the watermarks from the given regions of videos.

    -__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: Annotated[int, Gt(gt=0)] = 10, min_frame_threshold: Annotated[int, Gt(gt=0)] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2616,7 +2699,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2632,18 +2715,18 @@
    -class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to resize videos by aspect ratio. AspectRatio = W / H.

    -STRATEGY = ['decrease', 'increase']
    +STRATEGY = ['decrease', 'increase']
    -__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
    +__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2669,7 +2752,7 @@
    -process_single(sample)[source]
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2685,13 +2768,13 @@
    -class data_juicer.ops.mapper.VideoResizeResolutionMapper(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, force_original_aspect_ratio: str = 'disable', force_divisible_by: int[int] = 2, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoResizeResolutionMapper(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, force_original_aspect_ratio: str = 'disable', force_divisible_by: Annotated[int, Gt(gt=0)] = 2, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to resize the resolution of videos. Super-resolution with deep learning is left for future work.

    -__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, force_original_aspect_ratio: str = 'disable', force_divisible_by: int[int] = 2, *args, **kwargs)[source]
    +__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, force_original_aspect_ratio: str = 'disable', force_divisible_by: Annotated[int, Gt(gt=0)] = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2715,7 +2798,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2731,12 +2814,12 @@
    -class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to split video by duration.

    -__init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]
    +__init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2758,24 +2841,24 @@
    -split_videos_by_duration(video_key, container)[source]
    +split_videos_by_duration(video_key, container)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to split video by key frame.

    -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
    +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2793,29 +2876,29 @@
    -get_split_key_frame(video_key, container)[source]
    +get_split_key_frame(video_key, container)[source]
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    -class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: Annotated[float, Ge(ge=0)] = 27.0, min_scene_len: Annotated[int, Ge(ge=0)] = 15, show_progress: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to cut videos into scene clips.

    -avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    +avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    -__init__(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +__init__(detector: str = 'ContentDetector', threshold: Annotated[float, Ge(ge=0)] = 27.0, min_scene_len: Annotated[int, Ge(ge=0)] = 15, show_progress: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2834,7 +2917,7 @@
    -process_single(sample, context=False)[source]
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2850,13 +2933,13 @@
    -class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate video tags from audio streams extracted from videos, using the Audio Spectrogram Transformer.

    -__init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +__init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2874,7 +2957,7 @@
    -process_single(sample, rank=None)[source]
    +process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2890,12 +2973,12 @@
    -class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
    +class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate video tags from frames extracted from videos.

    -__init__(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
    +__init__(frame_sampling_method: str = 'all_keyframes', frame_num: Annotated[int, Gt(gt=0)] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2924,7 +3007,7 @@
    -process_single(sample, rank=None, context=False)[source]
    +process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2940,7 +3023,7 @@
    -class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]
    +class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]

    Bases: Mapper

    Mapper to normalize different kinds of whitespace characters to the plain space character ‘ ‘ (0x20) in text samples.

    @@ -2948,7 +3031,7 @@ https://en.wikipedia.org/wiki/Whitespace_character

    -__init__(*args, **kwargs)[source]
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -2962,7 +3045,7 @@
    -process_batched(samples)[source]
    +process_batched(samples)[source]
    diff --git a/data_juicer.ops.selector.html b/data_juicer.ops.selector.html index 47f5c7dca..207d94d7f 100644 --- a/data_juicer.ops.selector.html +++ b/data_juicer.ops.selector.html @@ -1,19 +1,19 @@ - + - data_juicer.ops.selector — data_juicer 1.0.0 documentation + data_juicer.ops.selector — data_juicer 1.0.1 documentation - - - + + + @@ -85,16 +85,16 @@
    -

    data_juicer.ops.selector

    +

    data_juicer.ops.selector

    -class data_juicer.ops.selector.FrequencySpecifiedFieldSelector(field_key: str = '', top_ratio: float[float] | None = None, topk: int[int] | None = None, reverse: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.selector.FrequencySpecifiedFieldSelector(field_key: str = '', top_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, topk: Annotated[int, Gt(gt=0)] | None = None, reverse: bool = True, *args, **kwargs)[source]

    Bases: Selector

    Selector to select samples based on the sorted frequency of specified field.

    -__init__(field_key: str = '', top_ratio: float[float] | None = None, topk: int[int] | None = None, reverse: bool = True, *args, **kwargs)[source]
    +__init__(field_key: str = '', top_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, topk: Annotated[int, Gt(gt=0)] | None = None, reverse: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -124,7 +124,7 @@
    -process(dataset)[source]
    +process(dataset)[source]

    Dataset –> dataset.

    Parameters:
    @@ -140,12 +140,12 @@
    -class data_juicer.ops.selector.RandomSelector(select_ratio: float[float] | None = None, select_num: int[int] | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.selector.RandomSelector(select_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, select_num: Annotated[int, Gt(gt=0)] | None = None, *args, **kwargs)[source]

    Bases: Selector

    Selector to randomly select samples.

    -__init__(select_ratio: float[float] | None = None, select_num: int[int] | None = None, *args, **kwargs)[source]
    +__init__(select_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, select_num: Annotated[int, Gt(gt=0)] | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -165,7 +165,7 @@
    -process(dataset)[source]
    +process(dataset)[source]

    Dataset –> dataset.

    Parameters:
    @@ -181,13 +181,13 @@
    -class data_juicer.ops.selector.RangeSpecifiedFieldSelector(field_key: str = '', lower_percentile: float[float] | None = None, upper_percentile: float[float] | None = None, lower_rank: int[int] | None = None, upper_rank: int[int] | None = None, *args, **kwargs)[source]
    +class data_juicer.ops.selector.RangeSpecifiedFieldSelector(field_key: str = '', lower_percentile: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, upper_percentile: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, lower_rank: Annotated[int, Gt(gt=0)] | None = None, upper_rank: Annotated[int, Gt(gt=0)] | None = None, *args, **kwargs)[source]

    Bases: Selector

    Selector to select a range of samples based on the sorted specified field value from smallest to largest.

    -__init__(field_key: str = '', lower_percentile: float[float] | None = None, upper_percentile: float[float] | None = None, lower_rank: int[int] | None = None, upper_rank: int[int] | None = None, *args, **kwargs)[source]
    +__init__(field_key: str = '', lower_percentile: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, upper_percentile: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, lower_rank: Annotated[int, Gt(gt=0)] | None = None, upper_rank: Annotated[int, Gt(gt=0)] | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -225,7 +225,7 @@
    -process(dataset)[source]
    +process(dataset)[source]

    Dataset –> dataset.

    Parameters:
    @@ -241,13 +241,13 @@
    -class data_juicer.ops.selector.TopkSpecifiedFieldSelector(field_key: str = '', top_ratio: float[float] | None = None, topk: int[int] | None = None, reverse: bool = True, *args, **kwargs)[source]
    +class data_juicer.ops.selector.TopkSpecifiedFieldSelector(field_key: str = '', top_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, topk: Annotated[int, Gt(gt=0)] | None = None, reverse: bool = True, *args, **kwargs)[source]

    Bases: Selector

    Selector to select top samples based on the sorted specified field value.

    -__init__(field_key: str = '', top_ratio: float[float] | None = None, topk: int[int] | None = None, reverse: bool = True, *args, **kwargs)[source]
    +__init__(field_key: str = '', top_ratio: Annotated[float, FieldInfo(annotation=NoneType, required=True, metadata=[Ge(ge=0), Le(le=1)])] | None = None, topk: Annotated[int, Gt(gt=0)] | None = None, reverse: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -277,7 +277,7 @@
    -process(dataset)[source]
    +process(dataset)[source]

    Dataset –> dataset.

    Parameters:
    diff --git a/data_juicer.tools.html b/data_juicer.tools.html index 003749672..d0e5b6c4d 100644 --- a/data_juicer.tools.html +++ b/data_juicer.tools.html @@ -1,19 +1,19 @@ - + - data_juicer.tools — data_juicer 1.0.0 documentation + data_juicer.tools — data_juicer 1.0.1 documentation - - - + + + @@ -77,7 +77,7 @@
    -

    data_juicer.tools

    +

    data_juicer.tools

    diff --git a/data_juicer.utils.html b/data_juicer.utils.html index adbb96403..b38bce16c 100644 --- a/data_juicer.utils.html +++ b/data_juicer.utils.html @@ -1,19 +1,19 @@ - + - data_juicer.utils — data_juicer 1.0.0 documentation + data_juicer.utils — data_juicer 1.0.1 documentation - - - + + + @@ -77,7 +77,7 @@
    -

    data_juicer.utils

    +

    data_juicer.utils

    diff --git a/genindex.html b/genindex.html index b5cbdb68d..997bba9ca 100644 --- a/genindex.html +++ b/genindex.html @@ -1,18 +1,18 @@ - + - Index — data_juicer 1.0.0 documentation + Index — data_juicer 1.0.1 documentation - - - + + + @@ -97,6 +97,7 @@

    Index

    | R | S | T + | U | V | W @@ -312,6 +313,10 @@

    _

  • (data_juicer.ops.mapper.PairPreferenceMapper method)
  • (data_juicer.ops.mapper.PunctuationNormalizationMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonFileMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonLambdaMapper method)
  • (data_juicer.ops.mapper.RemoveBibliographyMapper method)
  • @@ -1228,6 +1233,10 @@

    P

  • (data_juicer.ops.mapper.NlpcdaZhMapper method)
  • (data_juicer.ops.mapper.PunctuationNormalizationMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonFileMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonLambdaMapper method)
  • (data_juicer.ops.mapper.RemoveBibliographyMapper method)
  • @@ -1366,6 +1375,10 @@

    P

  • (data_juicer.ops.mapper.OptimizeQAMapper method)
  • (data_juicer.ops.mapper.PairPreferenceMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonFileMapper method) +
  • +
  • (data_juicer.ops.mapper.PythonLambdaMapper method)
  • (data_juicer.ops.mapper.VideoFaceBlurMapper method)
  • @@ -1385,6 +1398,10 @@

    P

  • PunctuationNormalizationMapper (class in data_juicer.ops.mapper) +
  • +
  • PythonFileMapper (class in data_juicer.ops.mapper) +
  • +
  • PythonLambdaMapper (class in data_juicer.ops.mapper)
  • @@ -1565,6 +1582,14 @@

    T

    +

    U

    + + +
    +

    V

      diff --git a/index.html b/index.html index 4d8731da7..3dc6fd2ca 100644 --- a/index.html +++ b/index.html @@ -1,19 +1,19 @@ - + - Welcome to data-juicer’s documentation! — data_juicer 1.0.0 documentation + Welcome to data-juicer’s documentation! — data_juicer 1.0.1 documentation - - - + + + @@ -78,9 +78,9 @@
      -

      Welcome to data-juicer’s documentation!

      +

      Welcome to data-juicer’s documentation!

      -

      Tutorial

      +

      Tutorial

      We will give a tutorial at KDD’24, “Multi-modal Data Processing for Foundation Models: Practical Guidances and Use Cases”; see more details here!

      API Reference

      @@ -183,6 +183,8 @@

      TutorialOptimizeResponseMapper
    • PairPreferenceMapper
    • PunctuationNormalizationMapper
    • +
    • PythonFileMapper
    • +
    • PythonLambdaMapper
    • RemoveBibliographyMapper
    • RemoveCommentsMapper
    • RemoveHeaderMapper
    • @@ -276,7 +278,7 @@

      Tutorial -

      Indices and Tables

      +

      Indices and Tables

      • Index

      • Module Index

      • diff --git a/modules.html b/modules.html index 02c0b03e2..c766806da 100644 --- a/modules.html +++ b/modules.html @@ -1,19 +1,19 @@ - + - data_juicer — data_juicer 1.0.0 documentation + data_juicer — data_juicer 1.0.1 documentation - - - + + + @@ -77,7 +77,7 @@
        -

        data_juicer

        +

        data_juicer

        • data_juicer
            diff --git a/objects.inv b/objects.inv index d1152f900..51714d370 100644 Binary files a/objects.inv and b/objects.inv differ diff --git a/py-modindex.html b/py-modindex.html index 724f53678..af4c0a97a 100644 --- a/py-modindex.html +++ b/py-modindex.html @@ -1,18 +1,18 @@ - + - Python Module Index — data_juicer 1.0.0 documentation + Python Module Index — data_juicer 1.0.1 documentation - - - + + + diff --git a/search.html b/search.html index 47bd1514f..14a698521 100644 --- a/search.html +++ b/search.html @@ -1,19 +1,19 @@ - + - Search — data_juicer 1.0.0 documentation + Search — data_juicer 1.0.1 documentation - - - + + + diff --git a/searchindex.js b/searchindex.js index bf6203507..b43debacb 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], 
"is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4, 9], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 9, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 6, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 6, 7, 8, 9], 
"lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8, 9], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "list": [2, 3, 4, 5, 6, 8, 9], "str": [2, 3, 4, 6, 7, 8, 9, 10], "jsonargpars": [2, 3], "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 3], "variabl": 2, "hard": 2, "code": [2, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "get_init_config": [2, 13], "namespac": [2, 3], "dict": [2, 3, 9], "set": [2, 3, 6, 8, 9, 10], "init": 2, "datajuc": 2, "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "bool": [2, 3, 7, 8, 9, 10], "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": [2, 3], "check": [2, 9], "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "prepare_side_config": [2, 13], "ori_config": 2, "string": [2, 7, 8, 9], "yml": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": [3, 8], "sample_interv": 3, 
"0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8, 9], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 9, 10], "resourc": 3, "util": [3, 8], "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5, 9], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": [3, 9], "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8, 9], "pre": [3, 9], "execut": 3, "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "notic": [3, 9], "should": [3, 7, 8, 9], "run": [3, 5, 8, 9], "cach": [3, 8], "enabl": [3, 9], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 3, "won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": 3, "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "load_data_np": 3, "int": [3, 4, 7, 8, 9, 10], "skip_return": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "api": [3, 9], "call": [3, 9], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 
8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": [3, 9], "add_column": 3, "add": [3, 4, 9], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": [3, 9], "compress": 3, "load_from_disk": 3, "wa": [3, 9], "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 6, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "like": [3, 6, 7, 8, 9], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "keep_in_memori": 3, "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 8, 9, 13], "improv": 3, "section": 3, "storage_opt": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 8, 9], "py": [3, 4], "d": [3, 4, 9], "unifi": [3, 4], "order": [3, 9, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], 
"dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "interv": 3, "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8, 9], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "draw_resource_util_graph": 3, "resource_util_list": 3, "store_dir": 3, "analyze_resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, 
"dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 8, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": [4, 9], "mixtureformatt": [4, 13], "max_sampl": 4, "mix": [4, 9], "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "instead": [4, 6], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 5, "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 7, 8, 9], "intermedi": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "reduc": [5, 8, 9], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 9, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], 
"concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "split_text_by_punctu": [6, 13], "zh": [6, 8], "punctuat": [6, 7, 9], "documentdedupl": [7, 13], "ignore_non_charact": 7, "exact": 7, "match": [7, 8, 9], "md5": 7, "ignor": [7, 9], "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": [7, 9], "kept": [7, 8, 9], "final": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": [7, 9], "shingl": 7, "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": [7, 9], "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "documentsimhashdedupl": [7, 13], "6": [7, 8, 9], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": [7, 9], "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "imagededupl": [7, 13], "phash": 7, "consider_text": 7, "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 
7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8, 9], "raydocumentdedupl": [7, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "videodedupl": [7, 13], "alphanumericfilt": [8, 13], "min_ratio": [8, 9], "25": 8, "max_ratio": [8, 9], "9223372036854775807": [8, 9], "numer": [8, 9], "within": [8, 9, 10], "alphanumer": 8, "total": [8, 9], "below": [8, 9], "audiodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "any_or_al": [8, 9], "durat": [8, 9], "second": [8, 9], "sy": 8, "maxsiz": 8, "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "averagelinelengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "flaggedwordfilt": [8, 13], "lang": [8, 9], "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": 8, "flag": 8, "what": [8, 9], "adopt": 8, "flagged_word": 8, "join": 8, "imageaestheticsfilt": [8, 13], "hf_scorer_model": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "aesthet": 8, "score": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "keyword": [8, 9], "imageaspectratiofilt": [8, 13], "333": 8, "3": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], "min_face_count": 8, "max_face_count": 8, "face": [8, 9], "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "minimum": [8, 9], "requir": 8, "imagefaceratiofilt": [8, 13], "area": 8, "largest": [8, 10], "imagensfwfilt": [8, 13], 
"hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": [8, 9], "low": 8, "nsfw": 8, "imagepairsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "closedunitinterv": 8, "imageshapefilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "shape": 8, "width": [8, 9], "height": [8, 9], "imagesizefilt": [8, 13], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "reduce_mod": 8, "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": [8, 9], "take": 8, "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": [8, 9], "probabl": [8, 9], "languageidscorefilt": [8, 13], "confid": 8, "larger": [8, 9, 10], "identif": 8, "maximumlinelengthfilt": [8, 13], "perplexityfilt": [8, 13], "max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": [8, 9], "out": 8, "larg": 8, "account": 8, "specialcharactersfilt": [8, 13], "specifiedfieldfilt": [8, 13], "field_kei": [8, 10], "target_valu": 8, "multi": [8, 9, 10, 13], "retain": [8, 9], "specifiednumericfieldfilt": [8, 13], "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "suffixfilt": [8, 13], "textactionfilt": [8, 13], "min_action_num": 8, "action": [8, 9], "mini_action_num": 8, "textentitydependencyfilt": [8, 13], 
"min_dependency_num": 8, "identifi": [8, 9], "entiti": [8, 9], "omit": 8, "mini_dependency_num": 8, "edg": [8, 9], "depend": [8, 9], "objet": 8, "textlengthfilt": [8, 13], "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": [8, 9], "videoaestheticsfilt": [8, 13], "frame_sampling_method": [8, 9], "frame_num": [8, 9], "frame": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "addit": [8, 9], "videoaspectratiofilt": [8, 13], "21": [8, 9], "videodurationfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": [8, 9], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "divis": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "equal": [8, 9, 10], "As": 8, "mai": [8, 9], "shorter": [8, 9], "dimens": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "setup_model": 8, "compute_flow": 8, "prev_fram": 8, "curr_fram": 8, "videomotionscoreraftfilt": [8, 13], "raft": 8, "recurr": 8, "transform": [8, 9], "torchvis": 8, "further": 8, "offici": 8, "http": [8, 9], "pytorch": 8, "vision": [8, 9], "main": [8, 9], "paper": 8, "here": [8, 9, 13], "arxiv": 8, "ab": 8, "2003": 8, "12039": 8, "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "found": [8, 9], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videoresolutionfilt": [8, 13], "resolut": [8, 9], "videotaggingfromframesfilt": 
[8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "videowatermarkfilt": [8, 13], "wordrepetitionfilt": [8, 13], "wordsnumfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "output": 9, "calibrateqamapp": [9, 13], "api_model": 9, "gpt": 9, "4o": 9, "api_endpoint": 9, "response_path": 9, "system_prompt": 9, "input_templ": 9, "reference_templ": 9, "qa_pair_templ": 9, "output_pattern": 9, "try_num": 9, "model_param": 9, "sampling_param": 9, "calibr": 9, "question": 9, "answer": 9, "default_system_prompt": 9, "\u8bf7\u6839\u636e\u63d0\u4f9b\u7684": 9, "\u53c2\u8003\u4fe1\u606f": 9, "\u5bf9": 9, "\u95ee\u9898": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u8fdb\u884c\u6821\u51c6": 9, "\u4f7f\u5176\u66f4\u52a0\u8be6\u7ec6": 9, "\u51c6\u786e": 9, "n\u6309\u7167\u4ee5\u4e0b\u683c\u5f0f\u8f93\u51fa": 9, "n\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "n\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "default_input_templ": 9, "qa_pair": 9, "default_reference_templ": 9, "default_qa_pair_templ": 9, "default_output_pattern": 9, "url": 9, "endpoint": 9, "respons": 9, "messag": 9, "prompt": 9, "task": 9, "templat": 9, "build": 9, "regular": 9, "express": 9, "temperatur": 9, "top_p": 9, "build_input": 9, "parse_output": 9, "raw_output": 9, "calibratequerymapp": [9, 13], "queri": 9, "\u5bf9\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u4e14\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "calibrateresponsemapp": [9, 13], "\u4e14\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "chineseconvertmapp": 
[9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "cleancopyrightmapp": [9, 13], "clean": 9, "copyright": 9, "comment": 9, "begin": 9, "cleanemailmapp": [9, 13], "repl": 9, "email": 9, "search": [9, 13], "replac": 9, "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "ipv4": 9, "ipv6": 9, "address": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "latex": 9, "extractentityattributemapp": [9, 13], "query_ent": 9, "query_attribut": 9, "entity_kei": 9, "__dj__main_entity__": 9, "attribute_kei": 9, "__dj__attribute__": 9, "attribute_desc_kei": 9, "__dj__attribute_description__": 9, "support_text_kei": 9, "__dj__attribute_support_text__": 9, "system_prompt_templ": 9, "attr_pattern_templ": 9, "demo_pattern": 9, "drop_text": 9, "attribut": 9, "default_system_prompt_templ": 9, "\u7ed9\u5b9a\u4e00\u6bb5\u6587\u672c": 9, "\u4ece\u6587\u672c\u4e2d\u603b\u7ed3": 9, "\u7684": 9, "\u5e76\u4e14\u4ece\u539f\u6587\u6458\u5f55\u6700\u80fd\u8bf4\u660e\u8be5": 9, "\u7684\u4ee3\u8868\u6027\u793a\u4f8b": 9, "n\u8981\u6c42": 9, "\u6458\u5f55\u7684\u793a\u4f8b\u5e94\u8be5\u7b80\u77ed": 9, "\u9075\u5faa\u5982\u4e0b\u7684\u56de\u590d\u683c\u5f0f": 9, "\u63cf\u8ff0": 9, "\u4ee3\u8868\u6027\u793a\u4f8b1": 9, "n\u8bf4\u660e": 9, "\u8be5": 9, "\u7684\u539f\u6587\u6458\u5f551": 9, "\u4ee3\u8868\u6027\u793a\u4f8b2": 9, "\u7684\u539f\u6587\u6458\u5f552": 9, "\u6587\u672c": 9, "default_attr_pattern_templ": 9, "z": 9, "default_demon_pattern": 9, "\u4ee3\u8868\u6027\u793a\u4f8b": 9, "__dj__entity__": 9, "entity_attribute_kei": 9, "descript": 9, "__dj__support_text__": 9, "retri": 9, "attempt": 9, "error": 9, "drop": 9, 
"demonstract": 9, "attribute_nam": 9, "extractentityrelationmapp": [9, 13], "entity_typ": 9, "relation_kei": 9, "__dj__relation__": 9, "prompt_templ": 9, "tuple_delimit": 9, "record_delimit": 9, "completion_delimit": 9, "max_glean": 9, "continue_prompt": 9, "if_loop_prompt": 9, "entity_pattern": 9, "relation_pattern": 9, "knowledg": 9, "graph": 9, "default_prompt_templ": 9, "goal": 9, "ngiven": 9, "potenti": 9, "relev": 9, "activ": 9, "relationship": 9, "among": 9, "step": 9, "n1": 9, "entity_nam": 9, "One": 9, "entity_descript": 9, "comprehens": 9, "nformat": 9, "n2": 9, "source_ent": 9, "target_ent": 9, "clearli": 9, "nfor": 9, "relationship_descript": 9, "explan": 9, "why": 9, "you": 9, "think": 9, "relationship_strength": 9, "strength": 9, "relationship_keyword": 9, "summar": 9, "overarch": 9, "natur": 9, "focus": 9, "concept": 9, "theme": 9, "rather": 9, "n3": 9, "n4": 9, "finish": 9, "nexampl": 9, "nentity_typ": 9, "person": 9, "technologi": 9, "mission": 9, "organ": 9, "ntext": 9, "nwhile": 9, "alex": 9, "clench": 9, "hi": 9, "jaw": 9, "buzz": 9, "frustrat": 9, "dull": 9, "against": 9, "backdrop": 9, "taylor": 9, "authoritarian": 9, "certainti": 9, "competit": 9, "undercurr": 9, "him": 9, "alert": 9, "sens": 9, "jordan": 9, "share": 9, "commit": 9, "discoveri": 9, "unspoken": 9, "rebellion": 9, "cruz": 9, "narrow": 9, "control": 9, "nthen": 9, "did": 9, "someth": 9, "unexpect": 9, "paus": 9, "besid": 9, "moment": 9, "observ": 9, "devic": 9, "akin": 9, "rever": 9, "tech": 9, "understood": 9, "said": 9, "voic": 9, "quieter": 9, "could": 9, "game": 9, "u": 9, "nthe": 9, "underli": 9, "dismiss": 9, "earlier": 9, "seem": 9, "falter": 9, "glimps": 9, "reluct": 9, "graviti": 9, "lai": 9, "hand": 9, "look": 9, "up": 9, "fleet": 9, "heartbeat": 9, "ey": 9, "lock": 9, "wordless": 9, "clash": 9, "wills": 9, "soften": 9, "uneasi": 9, "truce": 9, "nit": 9, "bare": 9, "percept": 9, "note": 9, "inward": 9, "nod": 9, "had": 9, "been": 9, "brought": 9, "noutput": 9, "who": 
9, "experi": 9, "dynam": 9, "portrai": 9, "toward": 9, "perspect": 9, "ha": 9, "signific": 9, "interact": 9, "associ": 9, "influenc": 9, "central": 9, "stori": 9, "implic": 9, "affect": 9, "attitud": 9, "power": 9, "contrast": 9, "directli": 9, "lead": 9, "mutual": 9, "conflict": 9, "ideolog": 9, "import": 9, "impact": 9, "technolog": 9, "\u4eba\u7269": 9, "\u6280\u672f": 9, "\u4efb\u52a1": 9, "\u7ec4\u7ec7": 9, "\u5730\u70b9": 9, "n\u4ed6\u4eec\u4e0d\u518d\u662f\u5355\u7eaf\u7684\u6267\u884c\u8005": 9, "\u4ed6\u4eec\u5df2\u6210\u4e3a\u67d0\u4e2a\u8d85\u8d8a\u661f\u8fb0\u4e0e\u6761\u7eb9\u7684\u9886\u57df\u7684\u4fe1\u606f\u5b88\u62a4\u8005": 9, "\u8fd9\u4e00\u4f7f\u547d\u7684\u63d0\u5347\u4e0d\u80fd\u88ab\u89c4\u5219\u548c\u65e2\u5b9a\u534f\u8bae\u6240\u675f\u7f1a": 9, "\u5b83\u9700\u8981\u4e00\u79cd\u65b0\u7684\u89c6\u89d2": 9, "\u4e00\u79cd\u65b0\u7684\u51b3\u5fc3": 9, "n\u968f\u7740\u4e0e\u534e\u76db\u987f\u7684\u901a\u8baf\u5728\u80cc\u666f\u4e2d\u55e1\u55e1\u4f5c\u54cd": 9, "\u5bf9\u8bdd\u4e2d\u7684\u7d27\u5f20\u60c5\u7eea\u901a\u8fc7\u561f\u561f\u58f0\u548c\u9759\u7535\u566a\u97f3\u8d2f\u7a7f\u59cb\u7ec8": 9, "\u56e2\u961f\u7ad9\u7acb\u7740": 9, "\u4e00\u80a1\u4e0d\u7965\u7684\u6c14\u606f\u7b3c\u7f69\u7740\u4ed6\u4eec": 9, "\u663e\u7136": 9, "\u4ed6\u4eec\u5728\u63a5\u4e0b\u6765\u51e0\u4e2a\u5c0f\u65f6\u5185\u505a\u51fa\u7684\u51b3\u5b9a\u53ef\u80fd\u4f1a\u91cd\u65b0\u5b9a\u4e49\u4eba\u7c7b\u5728\u5b87\u5b99\u4e2d\u7684\u4f4d\u7f6e": 9, "\u6216\u8005\u5c06\u4ed6\u4eec\u7f6e\u4e8e\u65e0\u77e5\u548c\u6f5c\u5728\u5371\u9669\u4e4b\u4e2d": 9, "n\u968f\u7740\u4e0e\u661f\u8fb0\u7684\u8054\u7cfb\u53d8\u5f97\u66f4\u52a0\u7262\u56fa": 9, "\u5c0f\u7ec4\u5f00\u59cb\u5904\u7406\u9010\u6e10\u6210\u5f62\u7684\u8b66\u544a": 9, "\u4ece\u88ab\u52a8\u63a5\u53d7\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005": 9, "\u6885\u745f\u540e\u6765\u7684\u76f4\u89c9\u5360\u636e\u4e86\u4e0a\u98ce": 9, "\u56e2\u961f\u7684\u4efb\u52a1\u5df2\u7ecf\u6f14\u53d8": 9, 
"\u4e0d\u518d\u4ec5\u4ec5\u662f\u89c2\u5bdf\u548c\u62a5\u544a": 9, "\u800c\u662f\u4e92\u52a8\u548c\u51c6\u5907": 9, "\u4e00\u573a\u8715\u53d8\u5df2\u7ecf\u5f00\u59cb": 9, "\u800c": 9, "\u675c\u5c14\u585e\u884c\u52a8": 9, "\u5219\u4ee5\u4ed6\u4eec\u5927\u80c6\u7684\u65b0\u9891\u7387\u9707\u52a8": 9, "\u8fd9\u79cd\u57fa\u8c03\u4e0d\u662f\u7531\u4e16\u4fd7\u8bbe\u5b9a\u7684": 9, "\u534e\u76db\u987f": 9, "\u534e\u76db\u987f\u662f\u6b63\u5728\u63a5\u6536\u901a\u8baf\u7684\u5730\u65b9": 9, "\u8868\u660e\u5176\u5728\u51b3\u7b56\u8fc7\u7a0b\u4e2d\u7684\u91cd\u8981\u6027": 9, "\u675c\u5c14\u585e\u884c\u52a8\u88ab\u63cf\u8ff0\u4e3a\u4e00\u9879\u5df2\u6f14\u53d8\u4e3a\u4e92\u52a8\u548c\u51c6\u5907\u7684\u4efb\u52a1": 9, "\u663e\u793a\u51fa\u76ee\u6807\u548c\u6d3b\u52a8\u7684\u91cd\u5927\u8f6c\u53d8": 9, "\u56e2\u961f": 9, "\u56e2\u961f\u88ab\u63cf\u7ed8\u6210\u4e00\u7fa4\u4ece\u88ab\u52a8\u89c2\u5bdf\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005\u7684\u4eba": 9, "\u5c55\u793a\u4e86\u4ed6\u4eec\u89d2\u8272\u7684\u52a8\u6001\u53d8\u5316": 9, "\u56e2\u961f\u6536\u5230\u6765\u81ea\u534e\u76db\u987f\u7684\u901a\u8baf": 9, "\u8fd9\u5f71\u54cd\u4e86\u4ed6\u4eec\u7684\u51b3\u7b56\u8fc7\u7a0b": 9, "\u51b3\u7b56": 9, "\u5916\u90e8\u5f71\u54cd": 9, "\u56e2\u961f\u76f4\u63a5\u53c2\u4e0e\u675c\u5c14\u585e\u884c\u52a8": 9, "\u6267\u884c\u5176\u6f14\u53d8\u540e\u7684\u76ee\u6807\u548c\u6d3b\u52a8": 9, "\u4efb\u52a1\u6f14\u53d8": 9, "\u79ef\u6781\u53c2\u4e0e": 9, "role": 9, "event": 9, "ntheir": 9, "slice": 9, "through": 9, "illus": 9, "intellig": 9, "liter": 9, "write": 9, "own": 9, "rule": [9, 10], "state": 9, "stoical": 9, "cast": 9, "watch": 9, "over": 9, "flurri": 9, "learn": 9, "commun": 9, "offer": 9, "sam": 9, "rivera": 9, "nearbi": 9, "interfac": 9, "youth": 9, "energi": 9, "bode": 9, "aw": 9, "anxieti": 9, "give": [9, 13], "talk": 9, "stranger": 9, "nalex": 9, "survei": 9, "team": 9, "studi": 9, "concentr": 9, "measur": 9, "trepid": 9, "well": 9, "our": 9, "contact": 9, 
"he": 9, "acknowledg": 9, "readi": 9, "whatev": 9, "back": 9, "ntogeth": 9, "stood": 9, "unknown": 9, "forg": 9, "human": 9, "heaven": 9, "ensu": 9, "silenc": 9, "palpabl": 9, "collect": 9, "introspect": 9, "about": 9, "grand": 9, "cosmic": 9, "plai": 9, "rewrit": 9, "histori": 9, "encrypt": 9, "dialogu": 9, "continu": 9, "unfold": 9, "intric": 9, "almost": 9, "uncanni": 9, "anticip": 9, "member": 9, "leader": 9, "abil": 9, "govern": 9, "challeng": 9, "capabl": 9, "taken": 9, "involv": 9, "make": 9, "leadership": 9, "explor": 9, "autonomi": 9, "real": 9, "input_text": 9, "default_continue_prompt": 9, "mani": 9, "were": 9, "miss": 9, "same": 9, "default_if_loop_prompt": 9, "appear": 9, "still": 9, "ye": 9, "NO": 9, "default_entity_typ": 9, "geo": 9, "default_tuple_delimit": 9, "default_record_delimit": 9, "default_completion_delimit": 9, "complet": 9, "default_entity_pattern": 9, "default_relation_pattern": 9, "defin": 9, "record": 9, "To": 9, "mark": 9, "end": 9, "num": 9, "llm": 9, "glean": 9, "stop": 9, "add_messag": 9, "light_rag_extract": 9, "extracteventmapp": [9, 13], "event_desc_kei": 9, "__dj__event_description__": 9, "relevant_char_kei": 9, "__dj__relevant_characters__": 9, "\u5bf9\u6587\u672c\u7684\u60c5\u8282\u8fdb\u884c\u5206\u70b9\u603b\u7ed3": 9, "\u5e76\u62bd\u53d6\u4e0e\u60c5\u8282\u76f8\u5173\u7684\u4eba\u7269": 9, "\u5c3d\u91cf\u4e0d\u8981\u9057\u6f0f\u5185\u5bb9": 9, "\u4e0d\u8981\u6dfb\u52a0\u6587\u672c\u4e2d\u6ca1\u6709\u7684\u60c5\u8282": 9, "\u7b26\u5408\u539f\u6587\u4e8b\u5b9e": 9, "\u8054\u7cfb\u4e0a\u4e0b\u6587\u8bf4\u660e\u524d\u56e0\u540e\u679c": 9, "\u4f46\u4ecd\u7136\u9700\u8981\u7b26\u5408\u4e8b\u5b9e": 9, "\u4e0d\u8981\u5305\u542b\u4e3b\u89c2\u770b\u6cd5": 9, "\u6ce8\u610f\u8981\u5c3d\u53ef\u80fd\u4fdd\u7559\u6587\u672c\u7684\u4e13\u6709\u540d\u8bcd": 9, "\u6ce8\u610f\u76f8\u5173\u4eba\u7269\u9700\u8981\u5728\u5bf9\u5e94\u60c5\u8282\u4e2d\u51fa\u73b0": 9, "\u53ea\u62bd\u53d6\u60c5\u8282\u4e2d\u7684\u4e3b\u8981\u4eba\u7269": 9, 
"\u4e0d\u8981\u9057\u6f0f\u60c5\u8282\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u603b\u7ed3\u683c\u5f0f\u5982\u4e0b": 9, "\u60c5\u82821": 9, "\u60c5\u8282\u63cf\u8ff0": 9, "\u76f8\u5173\u4eba\u7269": 9, "\u4eba\u72691": 9, "\u4eba\u72692": 9, "\u4eba\u72693": 9, "\u60c5\u82822": 9, "\u60c5\u82823": 9, "\u60c5\u8282": 9, "extractkeywordmapp": [9, 13], "keyword_kei": 9, "__dj__keyword__": 9, "topic": 9, "entir": 9, "These": 9, "idea": 9, "present": 9, "content_keyword": 9, "high_level_keyword": 9, "\u51b3\u7b56\u5236\u5b9a": 9, "\u5b87\u5b99\u610f\u4e49": 9, "extractnicknamemapp": [9, 13], "nickname_kei": 9, "__dj__nickname__": 9, "nicknam": 9, "\u7ed9\u5b9a\u4f60\u4e00\u6bb5\u6587\u672c": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u5c06\u4eba\u7269\u4e4b\u95f4\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u6635\u79f0": 9, "\u63d0\u53d6\u51fa\u6765": 9, "\u9700\u8981\u7ed9\u51fa\u8bf4\u8bdd\u4eba\u5bf9\u88ab\u79f0\u547c\u4eba\u7684\u79f0\u547c": 9, "\u4e0d\u8981\u641e\u53cd\u4e86": 9, "\u76f8\u540c\u7684\u8bf4\u8bdd\u4eba\u548c\u88ab\u79f0\u547c\u4eba\u6700\u591a\u7ed9\u51fa\u4e00\u4e2a\u6700\u5e38\u7528\u7684\u79f0\u547c": 9, "\u8bf7\u4e0d\u8981\u8f93\u51fa\u4e92\u76f8\u6ca1\u6709\u6635\u79f0\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u8f93\u51fa\u683c\u5f0f\u5982\u4e0b": 9, "\u79f0\u547c\u65b9\u5f0f1": 9, "\u8bf4\u8bdd\u4eba": 9, "\u88ab\u79f0\u547c\u4eba": 9, "\u7684\u6635\u79f0": 9, "\u79f0\u547c\u65b9\u5f0f2": 9, "\u79f0\u547c\u65b9\u5f0f3": 9, "\u79f0\u547c\u65b9\u5f0f": 9, "doubl": 9, "fixunicodemapp": [9, 13], "fix": 9, "unicod": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "generateqafromexamplesmapp": [9, 13], "hf_model": 9, "qwen": 9, "qwen2": 9, "7b": 9, "instruct": 9, "seed_fil": 9, "example_num": 9, "similarity_threshold": 9, "example_templ": 9, "enable_vllm": 9, "your": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, 
"\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u6ce8\u610f": 9, "\u65b0\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "\u751f\u6210\u7684": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e00\u81f4": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, "\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "\u63d0\u4f9b\u7684": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u800c\u4e14": 9, "\u9700\u8981\u5728": 9, "\u4e4b\u524d": 9, "default_example_templ": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "hugginfac": 9, "id": 9, "chatml": 9, "put": 9, "qa": 9, "guid": 9, "placehold": 9, "vllm": 9, "infer": 9, "acceler": 9, "qa_exampl": 9, "generateqafromtextmapp": [9, 13], "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "suitabl": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imageblurmapp": [9, 13], "p": 9, "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "blure": 9, "kernel": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "api_kei": 9, "max_token": 9, "user_prompt": 9, "user_prompt_kei": 9, "keep_original_sampl": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "authent": 9, "guidanc": [9, 13], "gpt4": 9, "uers_prompt_kei": 9, "imagecaptioningmapp": [9, 13], "hf_img2seq": 9, "blip2": 9, "opt": 9, "caption_num": 9, 
"keep_candidate_mod": 9, "random_ani": 9, "prompt_kei": 9, "caption": 9, "anoth": 9, "how": 9, "candid": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "similar_on": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "nlpaugenmapp": [9, 13], "sequenti": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "librari": 9, "semant": 9, "significantli": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "love": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "nlpcdazhmapp": [9, 13], "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, "nlpcda": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, 
"\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeqamapp": [9, 13], "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u4f7f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "sure": 9, "optimizequerymapp": [9, 13], "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "optimizeresponsemapp": [9, 13], "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "pairpreferencemapp": [9, 13], "rejected_kei": 9, "rejected_respons": 9, "reason_kei": 9, "reason": 9, "prefer": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u6839\u636e\u53c2\u8003\u4fe1\u606f\u4fee\u6539\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u5728\u8bed\u8a00\u98ce\u683c": 9, "\u4e8b\u5b9e\u6027": 9, "\u4eba\u7269\u8eab\u4efd": 9, "\u7acb\u573a\u7b49\u4efb\u4e00\u65b9\u9762\u4e0e\u539f\u56de\u7b54\u76f8\u53cd": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f\u8f93\u51fa": 9, "\u4e0d\u8981\u8f93\u51fa\u5176\u4ed6\u591a\u4f59\u5185\u5bb9": 9, "n\u751f\u6210\u7684\u65b0\u56de\u7b54": 9, "\u539f\u56e0": 9, "n\u751f\u6210\u8be5\u56de\u7b54\u7684\u539f\u56e0": 9, "n\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "repons": 9, "reject": 9, "failur": 9, "punctuationnormalizationmapp": [9, 13], "removebibliographymapp": [9, 13], "bibliographi": 9, "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "removeheadermapp": [9, 13], "drop_no_head": 
9, "header": 9, "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "replacecontentmapp": [9, 13], "design": 9, "sentencesplitmapp": [9, 13], "textchunkmapp": [9, 13], "split_pattern": 9, "overlap_len": 9, "len": 9, "forc": 9, "cut": 9, "offerd": 9, "tiktoken": 9, "dashscop": 9, "72b": 9, "recursively_chunk": 9, "get_text_chunk": 9, "videocaptioningfromaudiomapp": [9, 13], "stream": 9, "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "too": 9, "bring": 9, "frequent": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videoffmpegwrappedmapp": [9, 13], "videofaceblurmapp": [9, 13], "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "adjust": 9, "either": 9, "enlarg": 9, "accept": 9, "videoresizeresolutionmapp": [9, 13], 
"force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "futur": 9, "necessari": 9, "ensur": 9, "integ": 9, "even": 9, "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "hf": 9, "trust": 9, "videotaggingfromframesmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], 
[1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "get_init_configs"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"], [2, 3, 1, "", "prepare_side_configs"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "draw_resource_util_graph"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 
2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], 
"data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "split_text_by_punctuation"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], 
"data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", 
"SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoMotionScoreRaftFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], 
"data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", 
"__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", 
"process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoMotionScoreRaftFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "CalibrateQAMapper"], [9, 1, 1, "", "CalibrateQueryMapper"], [9, 1, 1, "", "CalibrateResponseMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractEntityAttributeMapper"], [9, 1, 1, "", "ExtractEntityRelationMapper"], [9, 1, 1, "", 
"ExtractEventMapper"], [9, 1, 1, "", "ExtractKeywordMapper"], [9, 1, 1, "", "ExtractNicknameMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PairPreferenceMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "TextChunkMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, 
"", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_REFERENCE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.CalibrateResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityAttributeMapper": [[9, 4, 1, "", "DEFAULT_ATTR_PATTERN_TEMPLATE"], [9, 4, 1, "", "DEFAULT_DEMON_PATTERN"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityRelationMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_CONTINUE_PROMPT"], [9, 4, 1, "", "DEFAULT_ENTITY_PATTERN"], [9, 4, 1, "", "DEFAULT_ENTITY_TYPES"], [9, 4, 1, "", 
"DEFAULT_IF_LOOP_PROMPT"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_RECORD_DELIMITER"], [9, 4, 1, "", "DEFAULT_RELATION_PATTERN"], [9, 4, 1, "", "DEFAULT_TUPLE_DELIMITER"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "add_message"], [9, 2, 1, "", "light_rag_extraction"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractEventMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractKeywordMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractNicknameMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", 
"process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.PairPreferenceMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], 
"data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.TextChunkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_text_chunks"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "recursively_chunk"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, 
"", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python 
method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, 
"data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() 
(data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "get_init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.get_init_configs"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "prepare_side_configs() (in module data_juicer.config)": [[2, "data_juicer.config.prepare_side_configs"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() 
(data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "draw_resource_util_graph() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.draw_resource_util_graph"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, 
"data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer 
method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() 
(data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value 
(data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() 
(data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "split_text_by_punctuation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_text_by_punctuation"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value 
(data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() 
(data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": 
[[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class 
in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videomotionscoreraftfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": 
[[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, 
"data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, 
"data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_flow() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_flow"]], "compute_flow() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.compute_flow"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() 
(data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() 
(data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], 
"compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, 
"data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, 
"data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, 
"data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": 
[[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "setup_model() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.setup_model"]], "setup_model() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.setup_model"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "calibrateqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper"]], "calibratequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper"]], "calibrateresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "default_attr_pattern_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_ATTR_PATTERN_TEMPLATE"]], "default_completion_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_completion_delimiter (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_COMPLETION_DELIMITER"]], "default_continue_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_CONTINUE_PROMPT"]], "default_demon_pattern (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_DEMON_PATTERN"]], "default_entity_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_PATTERN"]], "default_entity_types (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_TYPES"]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE"]], "default_if_loop_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_IF_LOOP_PROMPT"]], "default_input_template 
(data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_INPUT_TEMPLATE"]], "default_output_pattern (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, 
"data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_OUTPUT_PATTERN"]], "default_prompt_template (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_prompt_template (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_PROMPT_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_record_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RECORD_DELIMITER"]], "default_reference_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_REFERENCE_TEMPLATE"]], "default_relation_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RELATION_PATTERN"]], "default_system_prompt (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibratequerymapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.calibrateresponsemapper attribute)": [[9, 
"data_juicer.ops.mapper.CalibrateResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_SYSTEM_PROMPT_TEMPLATE"]], "default_tuple_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_TUPLE_DELIMITER"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractentityattributemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper"]], "extractentityrelationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper"]], "extracteventmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.ExtractEventMapper"]], "extractkeywordmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper"]], "extractnicknamemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper"]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper"]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper"]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper"]], "pairpreferencemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper"]], "punctuationnormalizationmapper (class in 
data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "textchunkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.TextChunkMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in 
data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() 
(data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper 
method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, 
"data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, 
"data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "add_message() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.add_message"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_input() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input"]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input"]], "build_input() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.build_input"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], 
"get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "get_text_chunks() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.get_text_chunks"]], "light_rag_extraction() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.light_rag_extraction"]], "parse_output() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibratequerymapper method)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.calibrateresponsemapper method)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, 
"data_juicer.ops.mapper.OptimizeQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.parse_output"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, 
"data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, 
"data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() 
(data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], 
"process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "recursively_chunk() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.recursively_chunk"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() 
(data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file +Search.setIndex({"alltitles": {"API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]], "Tutorial": [[13, "tutorial"]], "Welcome to data-juicer\u2019s documentation!": [[13, null]], "data_juicer": [[0, null], [14, null]], "data_juicer.analysis": [[1, null]], "data_juicer.config": [[2, null]], "data_juicer.core": [[3, null]], "data_juicer.format": [[4, null]], "data_juicer.ops": [[5, null]], "data_juicer.ops.common": [[6, null]], "data_juicer.ops.deduplicator": [[7, null]], "data_juicer.ops.filter": [[8, null]], "data_juicer.ops.mapper": [[9, null]], "data_juicer.ops.selector": [[10, null]], "data_juicer.tools": [[11, 
null]], "data_juicer.utils": [[12, null]], "}": [[3, "id1"], [3, "id2"]]}, "docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "envversion": {"sphinx": 64, "sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1}, "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "indexentries": {"__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__", false]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__", false]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__", false]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__", false]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__", false]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__", false]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__", false]], "__init__() (data_juicer.core.monitor method)": [[3, 
"data_juicer.core.Monitor.__init__", false]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__", false]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__", false]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__", false]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__", false]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__", false]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__", false]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__", false]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__", false]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__", false]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__", false]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__", false]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__", false]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__", false]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__", false]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__", false]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__", false]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__", false]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__", false]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__", false]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__", false]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__", false]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__", 
false]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__", false]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__", false]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__", false]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__", false]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__", false]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter 
method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__", false]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__", false]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__", false]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__", false]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__", false]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__", false]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__", false]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__", false]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__", false]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, 
"data_juicer.ops.filter.VideoMotionScoreFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__", false]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__", false]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__", false]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__", false]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__", false]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__", false]], "__init__() 
(data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, 
"data_juicer.ops.mapper.ImageCaptioningMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.pythonfilemapper method)": [[9, "data_juicer.ops.mapper.PythonFileMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.pythonlambdamapper method)": [[9, "data_juicer.ops.mapper.PythonLambdaMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__", false]], "__init__() 
(data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper 
method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__", false]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__", false]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__", false]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__", false]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__", false]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__", false]], "__init__() 
(data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__", false]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads", false]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter", false]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column", false]], "add_message() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.add_message", false]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter", false]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze", false]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze", false]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze", false]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list", false]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util", false]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer", false]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter", false]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper", false]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter", false]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter", false]], "avaliable_detectors 
(data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors", false]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter", false]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy", false]], "build_input() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.build_input", false]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input", false]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input", false]], "build_input() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.build_input", false]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash", false]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash", false]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash", false]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash", false]], "calibrateqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper", false]], "calibratequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper", false]], "calibrateresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper", false]], "characterrepetitionfilter 
(class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter", false]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper", false]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper", false]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper", false]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper", false]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper", false]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper", false]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files", false]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis", false]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute", false]], "compute_flow() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_flow", false]], "compute_flow() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.compute_flow", false]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash", false]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash", false]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash", false]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash", false]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash", false]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash", false]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched", false]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched", false]], "compute_stats_single() 
(data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, 
"data_juicer.ops.filter.ImageShapeFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, 
"data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single", false]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single", false]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter", false]], 
"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count", false]], "data_juicer": [[0, "module-data_juicer", false]], "data_juicer.analysis": [[1, "module-data_juicer.analysis", false]], "data_juicer.config": [[2, "module-data_juicer.config", false]], "data_juicer.core": [[3, "module-data_juicer.core", false]], "data_juicer.format": [[4, "module-data_juicer.format", false]], "data_juicer.ops": [[5, "module-data_juicer.ops", false]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common", false]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator", false]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter", false]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper", false]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector", false]], "data_juicer.tools": [[11, "module-data_juicer.tools", false]], "data_juicer.utils": [[12, "module-data_juicer.utils", false]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator", false]], "default_attr_pattern_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_ATTR_PATTERN_TEMPLATE", false]], "default_completion_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_COMPLETION_DELIMITER", false]], "default_completion_delimiter (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_COMPLETION_DELIMITER", false]], "default_continue_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_CONTINUE_PROMPT", false]], "default_demon_pattern (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_DEMON_PATTERN", false]], 
"default_entity_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_PATTERN", false]], "default_entity_types (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_ENTITY_TYPES", false]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE", false]], "default_if_loop_prompt (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_IF_LOOP_PROMPT", false]], "default_input_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_input_template (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_INPUT_TEMPLATE", false]], "default_output_pattern (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, 
"data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_output_pattern (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_OUTPUT_PATTERN", false]], "default_prompt_template (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_PROMPT_TEMPLATE", false]], "default_prompt_template (data_juicer.ops.mapper.extractkeywordmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.DEFAULT_PROMPT_TEMPLATE", false]], "default_qa_pair_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_QA_PAIR_TEMPLATE", false]], "default_qa_pair_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE", false]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE", false]], "default_record_delimiter 
(data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RECORD_DELIMITER", false]], "default_reference_template (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_REFERENCE_TEMPLATE", false]], "default_relation_pattern (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_RELATION_PATTERN", false]], "default_system_prompt (data_juicer.ops.mapper.calibrateqamapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.calibratequerymapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.calibrateresponsemapper attribute)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.extracteventmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.extractnicknamemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt (data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT", false]], 
"default_system_prompt (data_juicer.ops.mapper.pairpreferencemapper attribute)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.DEFAULT_SYSTEM_PROMPT", false]], "default_system_prompt_template (data_juicer.ops.mapper.extractentityattributemapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.DEFAULT_SYSTEM_PROMPT_TEMPLATE", false]], "default_tuple_delimiter (data_juicer.ops.mapper.extractentityrelationmapper attribute)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.DEFAULT_TUPLE_DELIMITER", false]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis", false]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator", false]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator", false]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator", false]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box", false]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist", false]], "draw_resource_util_graph() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.draw_resource_util_graph", false]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS", false]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE", false]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter", false]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe", false]], "executor (class in 
data_juicer.core)": [[3, "data_juicer.core.Executor", false]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper", false]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export", false]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats", false]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config", false]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter", false]], "extractentityattributemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper", false]], "extractentityrelationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper", false]], "extracteventmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractEventMapper", false]], "extractkeywordmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper", false]], "extractnicknamemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper", false]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter", false]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter", false]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper", false]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter", false]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector", false]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict", false]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.GenerateQAFromExamplesMapper", false]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper", false]], "get_init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.get_init_configs", false]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader", false]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document", false]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame", false]], "get_text_chunks() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.get_text_chunks", false]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document", false]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB", false]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter", false]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter", false]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper", false]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper", false]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper", false]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator", false]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper", false]], "imagefaceblurmapper (class in 
data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper", false]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter", false]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter", false]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter", false]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter", false]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter", false]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter", false]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper", false]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter", false]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter", false]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter", false]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs", false]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available", false]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter", false]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB", false]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter", false]], "light_rag_extraction() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.light_rag_extraction", false]], "load_dataset() (data_juicer.format.emptyformatter 
method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset", false]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset", false]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset", false]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset", false]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset", false]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset", false]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter", false]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk", false]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops", false]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter", false]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map", false]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper", false]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE", false]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter", false]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config", false]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline", false]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB", false]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter", false]], "module": [[0, 
"module-data_juicer", false], [1, "module-data_juicer.analysis", false], [2, "module-data_juicer.config", false], [3, "module-data_juicer.core", false], [4, "module-data_juicer.format", false], [5, "module-data_juicer.ops", false], [6, "module-data_juicer.ops.common", false], [7, "module-data_juicer.ops.deduplicator", false], [8, "module-data_juicer.ops.filter", false], [9, "module-data_juicer.ops.mapper", false], [10, "module-data_juicer.ops.selector", false], [11, "module-data_juicer.tools", false], [12, "module-data_juicer.utils", false]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor", false]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources", false]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources", false]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func", false]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset", false]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper", false]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper", false]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value", false]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value", false]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper", false]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper", false]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper", false]], "overallanalysis (class in data_juicer.analysis)": [[1, 
"data_juicer.analysis.OverallAnalysis", false]], "pairpreferencemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper", false]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter", false]], "parse_output() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.calibratequerymapper method)": [[9, "data_juicer.ops.mapper.CalibrateQueryMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.calibrateresponsemapper method)": [[9, "data_juicer.ops.mapper.CalibrateResponseMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityRelationMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, 
"data_juicer.ops.mapper.OptimizeQueryMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.parse_output", false]], "parse_output() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.parse_output", false]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter", false]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter", false]], "prepare_side_configs() (in module data_juicer.config)": [[2, "data_juicer.config.prepare_side_configs", false]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch", false]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process", false]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process", false]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process", false]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process", false]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process", false]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process", false]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process", false]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process", false]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, 
"data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process", false]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process", false]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process", false]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process", false]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched", false]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched", false]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched", false]], "process_batched() 
(data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.extractentityattributemapper method)": [[9, "data_juicer.ops.mapper.ExtractEntityAttributeMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.extracteventmapper method)": [[9, "data_juicer.ops.mapper.ExtractEventMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, 
"data_juicer.ops.mapper.ImageCaptioningMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.pythonfilemapper method)": [[9, "data_juicer.ops.mapper.PythonFileMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.pythonlambdamapper method)": [[9, "data_juicer.ops.mapper.PythonLambdaMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, 
"data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched", false]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, 
"data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched", false]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single", false]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single", false]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single", false]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single", false]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single", false]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single", false]], "process_single() 
(data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single", false]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single", false]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single", false]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single", false]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single", false]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single", false]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single", false]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single", false]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single", false]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single", false]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, 
"data_juicer.ops.filter.VideoAestheticsFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single", false]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single", false]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single", false]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.calibrateqamapper method)": [[9, "data_juicer.ops.mapper.CalibrateQAMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.extractentityrelationmapper method)": [[9, 
"data_juicer.ops.mapper.ExtractEntityRelationMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.extractkeywordmapper method)": [[9, "data_juicer.ops.mapper.ExtractKeywordMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.extractnicknamemapper method)": [[9, "data_juicer.ops.mapper.ExtractNicknameMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.pairpreferencemapper method)": [[9, "data_juicer.ops.mapper.PairPreferenceMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.pythonfilemapper method)": [[9, "data_juicer.ops.mapper.PythonFileMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.pythonlambdamapper method)": [[9, "data_juicer.ops.mapper.PythonLambdaMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, 
"data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single", false]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single", false]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper", false]], "pythonfilemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PythonFileMapper", false]], "pythonlambdamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PythonLambdaMapper", false]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample", false]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector", false]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector", false]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator", false]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator", false]], "rayemptyformatter (class in data_juicer.format)": [[4, 
"data_juicer.format.RayEmptyFormatter", false]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator", false]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator", false]], "recursively_chunk() (data_juicer.ops.mapper.textchunkmapper method)": [[9, "data_juicer.ops.mapper.TextChunkMapper.recursively_chunk", false]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column", false]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter", false]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns", false]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper", false]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper", false]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper", false]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper", false]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper", false]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper", false]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper", false]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper", false]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper", false]], 
"replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper", false]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run", false]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run", false]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run", false]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run", false]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run", false]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run", false]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data", false]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select", false]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns", false]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector", false]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper", false]], "setup_model() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.setup_model", false]], "setup_model() (data_juicer.ops.filter.videomotionscoreraftfilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter.setup_model", false]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word", false]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings", false]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, 
"data_juicer.ops.filter.SpecialCharactersFilter", false]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter", false]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter", false]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace", false]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace", false]], "split_text_by_punctuation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_text_by_punctuation", false]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration", false]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter", false]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY", false]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip", false]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES", false]], "suffixes (data_juicer.format.tsvformatter 
attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES", false]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter", false]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch", false]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter", false]], "textchunkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.TextChunkMapper", false]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter", false]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter", false]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter", false]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB", false]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json", false]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl", false]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet", false]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter", false]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector", false]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper", false]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator", false]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter", false]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper", false]], "tracer (class in data_juicer.core)": [[3, 
"data_juicer.core.Tracer", false]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter", false]], "update_args() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.update_args", false]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter", false]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter", false]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper", false]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper", false]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper", false]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper", false]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator", false]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter", false]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper", false]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper", false]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter", false]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter", false]], "videomotionscoreraftfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreRaftFilter", false]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, 
"data_juicer.ops.filter.VideoNSFWFilter", false]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter", false]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper", false]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper", false]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper", false]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter", false]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper", false]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper", false]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper", false]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper", false]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter", false]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper", false]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter", false]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper", false]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter", false]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation", false]], "words_refinement() 
(in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement", false]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter", false]]}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "get_init_configs"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"], [2, 3, 1, "", "prepare_side_configs"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 
1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "draw_resource_util_graph"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"], [3, 2, 1, "", "update_args"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, 
"", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "split_text_by_punctuation"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", 
"DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], 
[8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoMotionScoreRaftFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], 
"data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", 
"__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", 
"process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoMotionScoreRaftFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_flow"], [8, 2, 1, "", "setup_model"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", 
"compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "CalibrateQAMapper"], [9, 1, 1, "", "CalibrateQueryMapper"], [9, 1, 1, "", "CalibrateResponseMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractEntityAttributeMapper"], [9, 1, 1, "", "ExtractEntityRelationMapper"], [9, 1, 1, "", "ExtractEventMapper"], [9, 1, 1, "", "ExtractKeywordMapper"], [9, 1, 1, "", "ExtractNicknameMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PairPreferenceMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "PythonFileMapper"], [9, 1, 1, "", "PythonLambdaMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "TextChunkMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", 
"VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_REFERENCE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.CalibrateQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.CalibrateResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", 
"process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityAttributeMapper": [[9, 4, 1, "", "DEFAULT_ATTR_PATTERN_TEMPLATE"], [9, 4, 1, "", "DEFAULT_DEMON_PATTERN"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractEntityRelationMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_CONTINUE_PROMPT"], [9, 4, 1, "", "DEFAULT_ENTITY_PATTERN"], [9, 4, 1, "", "DEFAULT_ENTITY_TYPES"], [9, 4, 1, "", "DEFAULT_IF_LOOP_PROMPT"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_RECORD_DELIMITER"], [9, 4, 1, "", "DEFAULT_RELATION_PATTERN"], [9, 4, 1, "", "DEFAULT_TUPLE_DELIMITER"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "add_message"], [9, 2, 1, "", "light_rag_extraction"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractEventMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractKeywordMapper": [[9, 4, 1, "", "DEFAULT_COMPLETION_DELIMITER"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_PROMPT_TEMPLATE"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ExtractNicknameMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], 
"data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], 
"data_juicer.ops.mapper.PairPreferenceMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.PythonFileMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PythonLambdaMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, 
"", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.TextChunkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_text_chunks"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "recursively_chunk"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", 
"process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "terms": {"": [1, 3, 7, 8, 9], "0": [3, 4, 5, 7, 8, 9, 10], "003": 8, "045": 8, "0b5": 9, "0x20": 9, "1": [1, 3, 4, 7, 8, 9, 10], "10": [3, 8, 9], "10000": 3, "1024": 3, "1048576": 3, "1073741824": 3, "1099511627776": 3, "10ve": 9, "12039": 8, "15": 9, "1500": 8, "1b8": 9, "1tb": 8, "2": [3, 6, 8, 9], "20": 9, "2003": 8, "21": [8, 9], "24": 13, "25": 8, "256": 7, "27": 9, "2nb": 9, "3": [8, 9], "308": 8, "333": 8, "4": [7, 8, 9], "42": 4, "4593": 9, "4b": 9, "4o": 9, "5": [3, 7, 8, 9], "500": [8, 9], "6": [7, 8, 9], "6380": 7, "7": [7, 9], "72b": 9, "7976931348623157e": 8, "7b": 9, "8": [3, 8, 9], "8b": 9, "9": [3, 8, 9], "9223372036854775807": [8, 9], "95": [8, 9], "9b": 8, "A": [3, 5, 7, 9], "And": [7, 9], "As": 8, "By": [8, 9], "For": [3, 5, 7, 8, 9], "If": [1, 3, 7, 8, 9], "In": [1, 3], "It": [3, 4, 
7, 8, 9], "NO": 9, "One": 9, "The": [3, 4, 5, 8, 9, 10], "These": 9, "To": 9, "__dj__attribute__": 9, "__dj__attribute_description__": 9, "__dj__attribute_support_text__": 9, "__dj__entity__": 9, "__dj__event_description__": 9, "__dj__image_tags__": 9, "__dj__keyword__": 9, "__dj__main_entity__": 9, "__dj__nickname__": 9, "__dj__relation__": 9, "__dj__relevant_characters__": 9, "__dj__support_text__": 9, "__dj__video_audio_tags__": 9, "__dj__video_frame_tags__": [8, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "__path__": 2, "ab": 8, "abil": 9, "about": 9, "abov": [3, 9], "abstractfilesystem": 3, "acceler": 9, "accept": 9, "access": 3, "accord": [3, 4, 5, 8, 9], "account": 8, "acknowledg": 9, "action": [8, 9], "activ": 9, "ad": [3, 6, 9], "adapt": [3, 13], "adapt_workload": 3, "adaptivedetector": 9, "add": [3, 4, 9], "add_column": 3, "add_final_scen": 9, "add_messag": 9, "add_suffix": 4, "addit": [8, 9], "address": 9, "adjust": 9, "adopt": 8, "aesthet": 8, "affect": 9, "after": [1, 3, 6, 7, 8, 9], "against": 9, "ai": [8, 9], "akin": 9, "alert": 9, "alex": 9, "algorith": 8, "algorithm": [3, 7, 9], "alibaba": 9, "all": [1, 3, 6, 8, 9], "all_keyfram": [8, 9], "allow": [8, 9], "almost": 9, "alphabet": [7, 8, 9], "alphanumer": 8, "alphanumericfilt": [8, 13], "also": 6, "although": 7, "alwai": 7, "among": 9, "amount": 9, "amrul": 8, "an": [1, 3, 4, 5, 7, 8, 9], "analysi": [3, 13], "analyz": [1, 2, 3, 13], "analyze_resource_util_list": 3, "analyze_single_resource_util": 3, "ani": [3, 6, 8, 9], "annot": [3, 7, 8, 9, 10], "anoth": 9, "answer": 9, "anticip": 9, "anxieti": 9, "any_or_al": [8, 9], "anyth": 8, "api": [3, 9], "api_endpoint": 9, "api_kei": 9, "api_model": 9, "appear": 9, "appli": [1, 3, 7, 9, 10], "approxim": 8, "ar": [2, 3, 6, 7, 8, 9, 10], "area": 8, "arg": [2, 3, 4, 5, 7, 8, 9, 10], "argument": [1, 3, 5, 8, 9], "arxiv": 8, "asm": 4, "aspect": [8, 9], "aspectratio": [8, 9], "asset": 8, "assist": 9, "associ": 9, "ast": 9, "attempt": 9, "attitud": 9, 
"attr_pattern_templ": 9, "attribut": 9, "attribute_desc_kei": 9, "attribute_kei": 9, "attribute_nam": 9, "audio": [5, 8, 9], "audio_kei": 5, "audiodurationfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "audionmfsnrfilt": [8, 13], "audioset": 9, "audiosizefilt": [8, 13], "aug_num": 9, "augment": [3, 6, 8, 9], "authent": 9, "authoritarian": 9, "autonomi": 9, "ava1": 8, "avail": [3, 8], "avaliable_detector": 9, "averag": [3, 8], "averagelinelengthfilt": [8, 13], "avg": [3, 8], "aw": 9, "ax": 1, "b": 9, "back": 9, "backdrop": 9, "backend": 3, "baichuan2": 9, "balanc": 3, "band": 7, "bare": 9, "base": [1, 3, 4, 5, 7, 8, 9, 10], "base_b": 3, "baseformatt": 4, "bash": 4, "basic": 7, "bat": 4, "batch": [3, 9], "batch_size_strategi": 3, "batched_op": 9, "batchmapp": 3, "bbox": 8, "been": 9, "befor": [3, 8], "begin": 9, "being": [8, 9], "below": [8, 9], "besid": 9, "better": [3, 8], "between": [6, 7, 8, 9], "bf16": 9, "bibliographi": 9, "bigger": [4, 9], "blip": [8, 9], "blip2": 9, "blob": 8, "block": 7, "block_siz": 9, "blur": 9, "blur_typ": 9, "blure": 9, "bode": 9, "bodi": 9, "bool": [2, 3, 7, 8, 9, 10], "boolean": [5, 7, 8, 9], "both": [9, 10], "bottom": [8, 9], "bound": 10, "box": [1, 9], "branch": 9, "bring": 9, "brought": 9, "bucket": 3, "build": 9, "build_input": 9, "buzz": 9, "byte": [7, 8], "c": 4, "cach": [3, 8], "calcul": [7, 8, 9], "calculate_hash": 7, "calibr": 9, "calibrateqamapp": [9, 13], "calibratequerymapp": [9, 13], "calibrateresponsemapp": [9, 13], "call": [3, 9], "can": [3, 8, 9], "candid": 9, "capabl": 9, "caption": 9, "caption_kei": 9, "caption_num": 9, "captur": 9, "capture_stderr": 9, "case": [6, 7, 8, 9, 13], "cast": 9, "cc": 4, "central": 9, "certainti": 9, "cfg": [2, 3, 4], "cfg_after_merg": 2, "ch_sim": 8, "challeng": 9, "chang": [3, 9], "char": [6, 8, 9], "charact": [6, 7, 8, 9], "characterrepetitionfilt": [8, 13], "chars_to_remov": 9, "chatml": 9, "check": [2, 9], "checkpoint": 3, "chines": [6, 7, 8, 9], "chineseclip": 8, 
"chineseconvertmapp": [9, 13], "choic": [8, 9], "choos": 9, "chunk": [8, 9], "clash": 9, "class": [1, 3, 4, 5, 7, 8, 9, 10], "classifi": [8, 9], "classmethod": [3, 4], "clean": 9, "cleancopyrightmapp": [9, 13], "cleanemailmapp": [9, 13], "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "cleanlinksmapp": [9, 13], "cleanup_cache_fil": 3, "clear": 3, "clearli": 9, "clench": 9, "clip": [8, 9], "close": 9, "closedunitinterv": 8, "cmake": 4, "cmd": 4, "coco": 8, "code": [2, 9], "col": 1, "collect": 9, "column": [1, 3, 9], "column_nam": 1, "columnwiseanalysi": [1, 3, 13], "com": 8, "combin": 9, "command": [2, 4, 9], "comment": 9, "commit": 9, "common": [3, 13], "commun": 9, "compar": 3, "comparison": 3, "competit": 9, "complet": 9, "completion_delimit": 9, "comprehens": 9, "compress": 3, "comput": [1, 3, 5, 6, 7, 8], "compute_flow": 8, "compute_hash": [5, 7], "compute_stats_batch": [5, 8], "compute_stats_singl": [5, 7, 8], "compvi": 9, "concaten": [6, 9], "concentr": 9, "concept": 9, "condit": [8, 9], "conduct": 5, "conf_thr": 8, "confid": 8, "config": [3, 5, 9, 13], "configur": [2, 3, 4, 9], "conflict": 9, "conifg": 2, "consequ": 4, "consid": [3, 7, 8, 9], "consider_text": 7, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_caption_from_video": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "constraint": 8, "construct": [3, 9], "constructor": 3, "contact": 9, "contain": [4, 6, 8, 9], "content": [3, 9], "content_keyword": 9, "contentdetector": 9, "context": [5, 7, 8, 9], "contigu": 9, "continu": 9, "continue_prompt": 9, "contrast": 9, "control": 9, "convers": 9, "convert": [6, 7, 9], "coodin": 9, "coordin": 9, "copi": 3, "copyright": 9, "core": 13, "corner": 9, "correspond": [8, 9, 10], "cosmic": 9, "could": 9, "count": [3, 8], "cpp": 4, "cpu": 3, "creat": 4, "cruz": 9, "css": 4, "csv": 4, "csvformatt": [4, 13], "cuda_device_count": [0, 14], "curr_fram": 8, "current": 3, "custom": 9, "cut": 9, 
"cv_classifi": [8, 9], "d": [3, 4, 9], "dashscop": 9, "data": [1, 3, 4, 5, 8, 9], "data_juic": 13, "datajuc": 2, "datas": 4, "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "dataset_path": [3, 4], "dataset_to_sampl": 3, "datasetdict": 3, "datasset": 4, "db": 8, "decid": [3, 5, 7, 8], "decreas": 9, "dedup": 8, "dedupl": [3, 5, 9, 13], "deep": 9, "default": [1, 2, 3, 4, 7, 8, 9], "default_attr_pattern_templ": 9, "default_completion_delimit": 9, "default_continue_prompt": 9, "default_demon_pattern": 9, "default_entity_pattern": 9, "default_entity_typ": 9, "default_example_templ": 9, "default_if_loop_prompt": 9, "default_input_templ": 9, "default_output_pattern": 9, "default_prompt_templ": 9, "default_qa_pair_templ": 9, "default_record_delimit": 9, "default_reference_templ": 9, "default_relation_pattern": 9, "default_system_prompt": 9, "default_system_prompt_templ": 9, "default_tuple_delimit": 9, "defaut": 2, "defin": 9, "definit": 9, "delet": 9, "delete_random_char": 9, "delete_random_word": 9, "delimit": [4, 9], "demo_pattern": 9, "demonstract": 9, "denois": 9, "denot": 9, "dens": 8, "depend": [8, 9], "descend": 10, "describ": 1, "descript": 9, "design": 9, "detail": [3, 8, 9, 13], "detect": [3, 7, 8, 9], "detection_method": 9, "detector": 9, "determin": [7, 9, 10], "devic": 9, "diagon": 8, "dialogu": 9, "dict": [2, 3, 9], "did": 9, "differ": [3, 4, 6, 7, 8, 9], "diffus": 9, "digit": 7, "dimens": [8, 9], "dir": 4, "directli": 9, "directori": [3, 4, 8], "disabl": 9, "discard": 9, "discoveri": 9, "disk": [1, 3], "dismiss": 9, "distanc": 7, "distribut": [1, 3, 9], "divers": [1, 9], "diversityanalysi": [1, 13], "divis": [8, 9], "djdataset": 3, "doc": [5, 7], "doc2qa": 9, "doc_typ": 9, "dockerfil": 4, "document": [6, 7, 8, 9], "documentdedupl": [7, 13], "documentminhashdedupl": [7, 13], "documentsimhashdedupl": [7, 13], "docx": [4, 8], "doubl": 9, "draw": 1, "draw_box": 1, "draw_hist": 1, "draw_resource_util_graph": 3, "drop": 9, "drop_no_head": 9, "drop_text": 9, "ds_dir": 4, 
"ds_file": 4, "due": 3, "dull": 9, "dup_pair": 3, "duplic": [3, 5, 7], "durat": [8, 9], "dure": 3, "dynam": 9, "dynamic_field": 3, "e": [2, 3, 4, 8, 9], "e501": 8, "each": [1, 3, 5, 7, 9], "earlier": 9, "easyocr": 8, "edg": [8, 9], "edit": 5, "effect": 3, "effici": 3, "ego4d": 9, "either": 9, "element": 6, "eleutherai": 8, "email": 9, "embed": 3, "emoji": 6, "empti": [4, 7, 9], "empty_hash_valu": 7, "emptyformatt": [4, 9, 13], "en": [1, 6, 8, 9], "enabl": [3, 9], "enable_vllm": 9, "encourag": 9, "encrypt": 9, "end": 9, "endpoint": 9, "energi": 9, "enforc": 9, "english": [7, 8, 9], "enhanc": 3, "enlarg": 9, "ensu": 9, "ensur": 9, "entir": 9, "entiti": [8, 9], "entity_attribute_kei": 9, "entity_descript": 9, "entity_kei": 9, "entity_nam": 9, "entity_pattern": 9, "entity_typ": 9, "entri": 2, "environ": [2, 3], "equal": [8, 9, 10], "equival": 9, "error": 9, "especi": [6, 8], "essenti": 9, "estim": 3, "etc": [1, 3, 4], "even": 9, "evenli": 8, "event": 9, "event_desc_kei": 9, "everi": 4, "exact": 7, "exampl": [3, 8, 9], "example_num": 9, "example_templ": 9, "exce": [3, 8, 9], "except": [3, 9], "exclud": 2, "execut": [3, 9], "execute_and_prob": 3, "executor": [2, 3, 13], "exist": 2, "expand": 9, "expandmacromapp": [9, 13], "expect": [2, 3, 9], "expens": 9, "experi": 9, "explan": 9, "explicitli": 3, "explor": 9, "export": [1, 3, 4, 5, 13], "export_compute_stat": 3, "export_config": [2, 13], "export_d": 3, "export_in_parallel": 3, "export_path": 3, "export_shard_s": 3, "export_stat": 3, "express": 9, "extent": 9, "extra": [3, 4, 7, 8, 9, 10], "extract": [3, 8, 9], "extractentityattributemapp": [9, 13], "extractentityrelationmapp": [9, 13], "extracteventmapp": [9, 13], "extractkeywordmapp": [9, 13], "extractnicknamemapp": [9, 13], "ey": 9, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f90": 4, "f95": 4, "face": [8, 9], "factor": 3, "fade_bia": 9, "failur": 9, "falconsai": 8, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "falter": 9, "farneback": 8, "faster": 6, "featur": 4, "feature_kei": 
[4, 9], "ffmpeg": 9, "field": [3, 4, 5, 7, 8, 9, 10], "field_kei": [8, 10], "fieldinfo": [7, 9, 10], "figur": [1, 3, 9], "file": [1, 2, 3, 4, 5, 8, 9], "file_path": 9, "filesystem": 3, "filter": [3, 5, 7, 9, 13], "filter_kwarg": 9, "filter_nam": 9, "final": [7, 9], "finetun": 9, "finish": 9, "first": [3, 6, 7, 8, 9], "fix": 9, "fixunicodemapp": [9, 13], "flag": 8, "flagged_word": 8, "flagged_words_dir": 8, "flaggedwordfilt": [8, 13], "fleet": 9, "flip": [8, 9], "float": [3, 7, 8, 9, 10], "flow": 8, "flurri": 9, "focus": 9, "follow": [3, 9], "forc": 9, "force_divisible_bi": 9, "force_original_aspect_ratio": 9, "forg": 9, "form": 9, "format": [2, 3, 8, 9, 13], "formatt": [3, 4], "former": [8, 9], "found": [8, 9], "foundat": 13, "fp16": 9, "fp32": 9, "fpp": 4, "frame": [8, 9], "frame_num": [8, 9], "frame_sample_num": 8, "frame_sampling_method": [8, 9], "frames_per_second": 8, "free": 3, "frequenc": 10, "frequency_specified_field_selector": 3, "frequencyspecifiedfieldselector": [10, 13], "frequent": 9, "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "from_dict": 3, "from_xx": 3, "frustrat": 9, "fsspec": 3, "ftp": 9, "full": [8, 9], "func": 3, "function": [1, 6, 7, 9], "function_nam": 9, "further": 8, "futur": 9, "g": [2, 3, 4, 9], "game": 9, "gaussian": 9, "ge": [7, 9, 10], "gener": [3, 9], "generated_dataset_config": [4, 9], "generateqafromexamplesmapp": [9, 13], "generateqafromtextmapp": [9, 13], "geo": 9, "get": [1, 6], "get_divers": 1, "get_init_config": [2, 13], "get_read": 8, "get_sentences_from_docu": [6, 13], "get_split_key_fram": 9, "get_text_chunk": 9, "get_words_from_docu": [6, 13], "gib": 3, "git": 9, "github": 8, "give": [9, 13], "given": [3, 8, 9], "glean": 9, "glimps": 9, "global": [2, 4, 9], "global_arg": 9, "global_cfg": 4, "go": 4, "goal": 9, "googl": 8, "govern": 9, "gpt": 9, "gpt4": 9, "gpu": 3, "gram": 8, "grand": 9, "graph": 9, "graviti": 9, "greater": [8, 9, 10], "ground": 8, "group": [6, 8], "group_siz": 6, "gt": [3, 7, 8, 9, 10], "guarante": 3, "guid": 
9, "guidanc": [9, 13], "guidance_scal": 9, "h": [4, 8, 9], "ha": 9, "haarcascade_frontalface_alt": [8, 9], "had": 9, "ham": 7, "hamming_dist": 7, "hand": 9, "hard": 2, "hash": [3, 5, 7], "have": [8, 9], "he": 9, "header": 9, "heartbeat": 9, "heaven": 9, "height": [8, 9], "help": 3, "here": [8, 9, 13], "hf": 9, "hf_ast": 9, "hf_blip": 8, "hf_clip": 8, "hf_diffus": 9, "hf_img2seq": 9, "hf_model": 9, "hf_nsfw_model": 8, "hf_owlvit": 8, "hf_scorer_model": 8, "hf_summar": 9, "hf_token": 8, "hf_video_blip": 9, "hf_watermark_model": 8, "hh": 4, "hi": 9, "high": [8, 9], "high_level_keyword": 9, "higher": 9, "him": 9, "histogram": 1, "histori": 9, "hk2": 9, "hk2t": 9, "home": 8, "homophon": 9, "hong": 9, "horizont": [8, 9], "horizontal_flip": [8, 9], "hostnam": 7, "how": 9, "hpp": 4, "html": [4, 8, 9], "http": [8, 9], "hub": 4, "hug": [8, 9], "hugginfac": 9, "huggingfac": [3, 4, 8, 9], "human": 9, "hzz": 8, "i": [2, 3, 4, 5, 6, 7, 8, 9], "id": 9, "idea": 9, "ident": 9, "identif": 8, "identifi": [8, 9], "ideolog": 9, "idiom": 9, "if_loop_prompt": 9, "ignor": [7, 9], "ignore_non_charact": 7, "ignore_pattern": 7, "ignore_special_charact": 9, "illus": 9, "imag": [1, 5, 7, 8, 9], "image_kei": 5, "imageaestheticsfilt": [8, 13], "imageaspectratiofilt": [8, 13], "imageblurmapp": [9, 13], "imagecaptioningfromgpt4vmapp": [9, 13], "imagecaptioningmapp": [9, 13], "imagededupl": [7, 13], "imagediffusionmapp": [9, 13], "imagefaceblurmapp": [9, 13], "imagefacecountfilt": [8, 13], "imagefaceratiofilt": [8, 13], "imagensfwfilt": [8, 13], "imagepairsimilarityfilt": [8, 13], "imageshapefilt": [8, 13], "imagesizefilt": [8, 13], "imagetaggingmapp": [9, 13], "imagetextmatchingfilt": [8, 13], "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "impact": 9, "implement": [3, 7], "implic": 9, "import": 9, "improv": 3, "in_memory_max_s": 3, "includ": [1, 3, 7, 8, 9], "incorrect": 9, "increas": 9, "independ": [3, 8, 9], "index": 13, "indic": [1, 9], "infer": 9, "influenc": 9, "info": 
[4, 5], "inform": [1, 3, 5, 7, 8, 9, 10], "init": 2, "init_config": [2, 13], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "inlin": 9, "input": [3, 5, 7, 8, 9, 10], "input_templ": 9, "input_text": 9, "insert": 9, "insert_random_char": 9, "instanc": 5, "instead": [4, 6], "instruct": 9, "int": [3, 4, 7, 8, 9, 10], "integ": 9, "intellig": 9, "interact": 9, "interfac": 9, "intermedi": [5, 7, 8], "interv": 3, "intric": 9, "introspect": 9, "invert": 6, "invok": 9, "involv": 9, "inward": 9, "iou": 8, "iou_thr": 8, "ipv4": 9, "ipv6": 9, "is_cuda_avail": [0, 14], "is_filt": 3, "item": [3, 5, 9], "iter": [3, 8, 9], "itm": 8, "its": [4, 5, 7, 9], "j": 4, "jaccard": 7, "jaccard_threshold": 7, "jaid": 8, "japanes": 9, "java": 4, "jaw": 9, "jl": 4, "join": 8, "join_char": 6, "jordan": 9, "jp2t": 9, "json": [2, 3, 4, 8], "json_ind": 2, "jsonargpars": [2, 3], "jsonformatt": [4, 13], "jsonl": [3, 4], "jsonnet": 2, "judg": 9, "kanji": 9, "karg": 3, "kb": 8, "kdd": 13, "keep": [3, 5, 7, 8, 9], "keep_alphabet": 9, "keep_candidate_mod": 9, "keep_hashes_in_res_d": 3, "keep_in_memori": 3, "keep_numb": 9, "keep_original_sampl": 9, "keep_punc": 9, "keep_stats_in_res_d": 3, "keep_tag_num": 9, "kei": [3, 4, 5, 8, 9, 10], "kept": [7, 8, 9], "kernel": 9, "kernel_s": 9, "keyboard": 9, "keyboard_error_char": 9, "keyfram": 8, "keyword": [8, 9], "keyword_kei": 9, "kib": 3, "kind": [8, 9], "knowledg": 9, "kong": 9, "kpyu": 9, "kwarg": [3, 4, 5, 7, 8, 9, 10], "ky\u016bjitai": 9, "l14": 8, "lai": 9, "lambda": 9, "lambda_str": 9, "lang": [8, 9], "lang_or_model": 1, "languag": [1, 7, 8, 9], "languageidscorefilt": [8, 13], "languages_to_detect": 8, "larg": 8, "large_area_ratio_thr": 8, "larger": [8, 9, 10], "largest": [8, 10], "last": [8, 9], "latex": 9, "latter": [8, 9], "le": [7, 9, 10], "lead": 9, "leader": 9, "leadership": 9, "learn": 9, "leav": 9, "left": [8, 9], "len": 9, "length": [3, 4, 8, 9], "less": [7, 8, 9, 10], "letter": 9, "level": [3, 5, 6, 7, 8, 9, 10], "lexic": 1, "librari": 9, 
"light_rag_extract": 9, "like": [3, 6, 7, 8, 9], "limit": 8, "line": [1, 2, 8, 9], "linearms": 8, "link": [3, 9], "list": [2, 3, 4, 5, 6, 8, 9], "liter": 9, "lkove": 9, "ll": 9, "llama3": 9, "llm": 9, "load": [1, 3, 4, 5, 9], "load_analysis_r": 3, "load_data_np": 3, "load_dataset": 4, "load_formatt": [4, 13], "load_from_disk": 3, "load_op": [5, 13], "local": 4, "localformatt": [4, 13], "localhost": 7, "locat": [8, 9], "lock": 9, "logo": 8, "long": 9, "longer": 8, "look": 9, "lot": 6, "love": 9, "low": 8, "lower": [6, 7, 8, 9, 10], "lower_cas": 6, "lower_percentil": 10, "lower_rank": 10, "lowercas": [6, 7, 9], "lsh": 7, "lua": 4, "luma_onli": 9, "m": [4, 9], "machin": 3, "macro": 9, "magnitud": 8, "mai": [8, 9], "main": [8, 9], "mainland": 9, "mainli": 3, "make": 9, "makefil": 4, "manag": [3, 9], "mani": 9, "manner": 3, "map": [3, 9], "mapper": [3, 5, 13], "mark": 9, "markdown": 4, "match": [7, 8, 9], "max": [3, 4, 7, 8, 9], "max_area_ratio": 8, "max_batch_s": 3, "max_col": 9, "max_dur": 8, "max_face_count": 8, "max_glean": 9, "max_height": [8, 9], "max_len": [8, 9], "max_num": 8, "max_ppl": 8, "max_ratio": [8, 9], "max_recal": 8, "max_sampl": 4, "max_scor": 8, "max_siz": 8, "max_snr": 8, "max_token": 9, "max_valu": 8, "max_width": [8, 9], "maximum": [3, 8, 9], "maximumlinelengthfilt": [8, 13], "maxsiz": 8, "mb": [3, 8], "md": 4, "md5": 7, "mean": [1, 3, 9], "measur": 9, "meet": [8, 9], "mem": 3, "member": 9, "memori": 3, "merg": [2, 4, 6, 8], "merge_config": [2, 13], "merge_on_whitespace_tab_newlin": [6, 13], "messag": 9, "meta": [2, 4], "metadata": [7, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "metric": [3, 5, 7, 8], "mib": 3, "middl": [8, 9], "might": [8, 9], "min": [3, 7, 8, 9], "min_action_num": 8, "min_area_ratio": 8, "min_col": 9, "min_content_v": 9, "min_delta_hsv": 9, "min_dependency_num": 8, "min_dur": 8, "min_face_count": 8, "min_frame_threshold": 9, "min_height": [8, 9], "min_last_split_dur": 9, "min_len": [8, 9], "min_num": 8, "min_ratio": [8, 9], 
"min_recal": 8, "min_repeat_sentence_length": 9, "min_scene_len": 9, "min_scor": 8, "min_siz": 8, "min_snr": 8, "min_valu": 8, "min_width": [8, 9], "minhash": 7, "minhashlsh": 7, "mini_action_num": 8, "mini_dependency_num": 8, "minim": 7, "minimum": [8, 9], "miss": 9, "mission": 9, "mit": 9, "mix": [4, 9], "mixtur": 4, "mixtureformatt": [4, 13], "mnb": 9, "modal": 13, "mode": [8, 9], "model": [1, 6, 7, 8, 9, 13], "model_func": 6, "model_param": 9, "modif": 3, "modul": [4, 13], "moment": 9, "monitor": [3, 13], "monitor_all_resourc": 3, "monitor_current_resourc": 3, "monitor_func": 3, "more": [3, 8, 9, 13], "most": [3, 9], "motion": 8, "multi": [8, 9, 10, 13], "multifil": 2, "multilin": 9, "multipl": [2, 3, 4, 6, 7, 8], "must": [4, 8, 9], "mutual": 9, "my": 3, "n": [6, 8, 9], "n1": 9, "n2": 9, "n3": 9, "n4": 9, "nalex": 9, "name": [1, 3, 4, 5, 8, 9], "namespac": [2, 3], "narrow": 9, "natur": 9, "nb": 9, "nearbi": 9, "necessari": 9, "need": [3, 6, 8, 9, 10], "neg": [7, 9], "nentity_typ": 9, "nest": 3, "nesteddataset": [3, 13], "new": [3, 4, 9], "new_cfg": 2, "new_lin": 6, "nexampl": 9, "nfc": 9, "nfd": 9, "nfkc": 9, "nfkd": 9, "nfor": 9, "nformat": 9, "ngiven": 9, "nicknam": 9, "nickname_kei": 9, "nit": 9, "nlpaug": 9, "nlpaugenmapp": [9, 13], "nlpcda": 9, "nlpcdazhmapp": [9, 13], "nm": 8, "nmf": 8, "nmf_iter_num": 8, "nod": 9, "node": 3, "nois": 9, "non": [6, 7, 9], "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "nonetyp": [7, 9, 10], "nonzero": 3, "noqa": 8, "normal": [8, 9], "note": 9, "notic": [3, 9], "noutput": 9, "now": [3, 6, 9], "nsfw": 8, "nsfw_image_detect": 8, "ntext": 9, "nthe": 9, "ntheir": 9, "nthen": 9, "ntogeth": 9, "null_valu": 4, "num": 9, "num_band": 7, "num_block": 7, "num_inference_step": 9, "num_permut": 7, "num_proc": [1, 3, 4], "num_rows_per_band": 7, "number": [1, 3, 4, 5, 7, 8, 9, 10], "numer": [8, 9], "nwhile": 9, "n\u4ed6\u4eec\u4e0d\u518d\u662f\u5355\u7eaf\u7684\u6267\u884c\u8005": 9, "n\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, 
"n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "n\u6309\u7167\u4ee5\u4e0b\u683c\u5f0f\u8f93\u51fa": 9, "n\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "n\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "n\u751f\u6210\u7684\u65b0\u56de\u7b54": 9, "n\u751f\u6210\u8be5\u56de\u7b54\u7684\u539f\u56e0": 9, "n\u8981\u6c42": 9, "n\u8bf4\u660e": 9, "n\u968f\u7740\u4e0e\u534e\u76db\u987f\u7684\u901a\u8baf\u5728\u80cc\u666f\u4e2d\u55e1\u55e1\u4f5c\u54cd": 9, "n\u968f\u7740\u4e0e\u661f\u8fb0\u7684\u8054\u7cfb\u53d8\u5f97\u66f4\u52a0\u7262\u56fa": 9, "object": [1, 2, 3, 8], "objet": 8, "observ": 9, "obtain": [3, 6], "ocr": [8, 9], "ocr_error_char": 9, "oe": 9, "offer": 9, "offerd": 9, "offici": 8, "omit": 8, "one": [1, 2, 6, 7, 8, 9], "onli": [3, 7, 8, 9], "op": [3, 13], "op_nam": 3, "open": [5, 7, 9], "open_monitor": 3, "openai": 8, "opencv": [8, 9], "opened_aug_method": 9, "oper": [3, 8], "opt": 9, "optic": 8, "optim": [7, 9], "optimizeqamapp": [9, 13], "optimizequerymapp": [9, 13], "optimizeresponsemapp": [9, 13], "option": [1, 3, 4, 9], "order": [3, 9, 10], "org": [8, 9], "organ": 9, "ori_cfg": 2, "ori_config": 2, "origin": [2, 3, 8, 9], "other": [3, 8, 9], "otherwis": 9, "our": 9, "out": 8, "output": 9, "output_path": 1, "output_pattern": 9, "ov4": 9, "over": 9, "overal": 1, "overall_result": 1, "overallanalysi": [1, 3, 13], "overarch": 9, "overlap": [8, 9], "overlap_len": 9, "overrid": 3, "overrul": 8, "overwrit": [2, 9], "overwrite_output": 9, "ovl": 9, "owl": 8, "owlvit": 8, "own": 9, "p": 9, "packag": 4, "page": 13, "pai": 9, "pair": [3, 5, 7, 8, 9], "pairpreferencemapp": [9, 13], "palpabl": 9, "panda": 1, "paper": 8, "param": [1, 2, 4, 6, 7, 9], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "parent": 9, "parquet": [3, 4], "parquetformatt": [4, 13], "pars": [2, 9], "parse_output": 9, "parser": 2, "parser_mod": 2, "pass": [3, 9], "patch32": 8, "path": [1, 2, 3, 4, 7, 8, 9], 
"pattern": [7, 9], "paus": 9, "pdf": [4, 8], "peopl": 8, "percentil": [1, 10], "percept": 9, "perform": 3, "perl": 4, "permut": 7, "perplex": 8, "perplexityfilt": [8, 13], "person": 9, "perspect": 9, "phash": 7, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "phrase": 8, "phrasegroundingrecallfilt": [8, 13], "pipelin": 3, "pixel": 9, "pixel_divers": 9, "pixel_valu": 9, "pl": 4, "placehold": 9, "plai": 9, "pleas": [7, 9], "plot": 1, "pm": 4, "pod": 4, "point": 9, "port": 7, "portrai": 9, "posit": [7, 8, 9], "posix": 2, "post": 8, "postproc_func": 1, "postproc_kwarg": 1, "potenti": 9, "power": 9, "practic": 13, "pre": [3, 9], "precomput": 1, "predict": 8, "predictor": 8, "prefer": 9, "prepare_side_config": [2, 13], "present": 9, "prev_fram": 8, "previous": 3, "previous_d": 3, "prob": 7, "prob_threshold": 8, "probabl": [8, 9], "probe": 3, "probe_small_batch": 3, "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "process_batch": [5, 8, 9], "process_list": 5, "process_singl": [5, 7, 8, 9], "processed_d": 3, "produc": 9, "progress": 9, "project": 8, "prompt": 9, "prompt_kei": 9, "prompt_templ": 9, "properti": 4, "provid": [4, 7, 9], "ps1": 4, "psd1": 4, "psm1": 4, "punctuat": [6, 7, 9], "punctuationnormalizationmapp": [9, 13], "put": 9, "py": [3, 4], "pypi": 8, "pythia": 8, "python": [3, 9], "pythonfilemapp": [9, 13], "pythonlambdamapp": [9, 13], "pytorch": 8, "qa": 9, "qa_exampl": 9, "qa_pair": 9, "qa_pair_templ": 9, "qualiti": 9, "quantil": 1, "queri": 9, "query_attribut": 9, "query_ent": 9, "question": 9, "quieter": 9, "qwen": 9, "qwen1_5": 9, "qwen2": 9, "r": 4, "radiu": 9, "raft": 8, "rai": [4, 7, 9], "ram": 8, "ram_tag_list": 8, "random": [4, 9, 10], "random_ani": 9, "random_sampl": 4, "randomli": [4, 9], "randomselector": [10, 13], "rang": [3, 8, 9, 10], "rangespecifiedfieldselector": [10, 13], "rank": [3, 8, 9, 10], "rate": 8, "rather": 9, "ratio": [3, 4, 6, 8, 9, 10], "raw": [3, 9], "raw_output": 9, "raybasicdedupl": [7, 13], "raydocumentdedupl": [7, 13], 
"rayemptyformatt": [4, 9, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "rb": 4, "readi": 9, "real": 9, "reason": 9, "reason_kei": 9, "rebellion": 9, "recal": 8, "recip": 3, "recogn": 8, "recommend": [7, 9], "record": 9, "record_delimit": 9, "recurr": 8, "recursively_chunk": 9, "red": 1, "redi": 7, "redis_host": 7, "redis_port": 7, "reduc": [5, 8, 9], "reduce_mod": 8, "refer": [8, 9], "reference_templ": 9, "refin": 6, "refine_single_column": 1, "regard": [7, 9], "region": 9, "regular": 9, "reject": 9, "rejected_kei": 9, "rejected_respons": 9, "rel": 8, "relat": [3, 8, 9], "relation_kei": 9, "relation_pattern": 9, "relationship": 9, "relationship_descript": 9, "relationship_keyword": 9, "relationship_strength": 9, "relev": 9, "relevant_char_kei": 9, "reluct": 9, "remot": [3, 9], "remoteformatt": [4, 13], "remov": [3, 5, 6, 8, 9], "remove_column": 3, "removebibliographymapp": [9, 13], "removecommentsmapp": [9, 13], "removeheadermapp": [9, 13], "removelongwordsmapp": [9, 13], "removenonchinesecharacterlmapp": [9, 13], "removerepeatsentencesmapp": [9, 13], "removespecificcharsmapp": [9, 13], "removetabletextmapp": [9, 13], "removewordswithincorrectsubstringsmapp": [9, 13], "rep_len": 8, "repeat": 9, "repetit": 8, "repl": 9, "replac": 9, "replace_equivalent_num": 9, "replace_homophone_char": 9, "replace_similar_word": 9, "replacecontentmapp": [9, 13], "repons": 9, "repositori": 4, "represent": 9, "request": [3, 9], "requir": [7, 8, 9, 10], "rescal": 8, "resiz": [8, 9], "resolut": [8, 9], "reson": 9, "resourc": 3, "resource_analysi": 3, "resource_util_dict": 3, "resource_util_list": 3, "respect": [1, 9], "respons": 9, "response_path": 9, "result": [1, 3, 8], "retain": [8, 9], "retri": 9, "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "rever": 9, "revers": [6, 10], "revis": 9, "rewrit": 9, "reykjavik": 9, "right": [8, 9], "rivera": 9, "roi": 9, "roi_kei": 9, "roi_str": 9, "roi_typ": 9, "role": 9, "row": 7, "rst": 4, "rule": [9, 10], "run": [3, 5, 8, 9], "runner": 
8, "s2hk": 9, "s2t": 9, "s2tw": 9, "s2twp": 9, "s3": 3, "sac": 8, "said": 9, "salesforc": [8, 9], "sam": 9, "same": 9, "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "sample_algo": 3, "sample_data": 3, "sample_interv": 3, "sample_numb": 4, "sample_ratio": 3, "sampling_fp": 8, "sampling_param": 9, "save": [1, 2, 3], "save_path": 1, "save_stats_in_one_fil": 1, "save_to_disk": 3, "scala": 4, "scale": 9, "scene": 9, "scenedetect": 9, "schedul": 3, "score": [8, 9], "score_threshold": 8, "search": [9, 13], "second": [8, 9], "section": 3, "see": [3, 13], "seed": [4, 9], "seed_fil": 9, "seem": 9, "select": [3, 4, 5, 8, 9, 10], "select_column": 3, "select_num": 10, "select_ratio": 10, "selector": [5, 13], "semant": 9, "sens": 9, "sentenc": [6, 9], "sentencepiec": 7, "sentencesplitmapp": [9, 13], "separ": [6, 8, 9, 10], "sequenc": [8, 9], "sequenti": 9, "server": 7, "set": [2, 3, 6, 8, 9, 10], "setup_model": 8, "sever": [1, 3, 9], "sh": 4, "shape": 8, "shard": 3, "share": 9, "shift": [8, 9], "shingl": 7, "shinjitai": 9, "shorter": [8, 9], "should": [3, 7, 8, 9], "should_keep_long_word": 9, "should_keep_word_with_incorrect_substr": 9, "show": [1, 3, 9], "show_num": [3, 5, 7], "show_percentil": 1, "show_progress": 9, "shunk031": 8, "signific": 9, "significantli": 9, "silenc": 9, "simhash": 7, "similar": [7, 8, 9], "similar_on": 9, "similar_one_simhash": 9, "similarity_threshold": 9, "simpl": [8, 9], "simpli": 9, "simplifi": 9, "simul": 9, "sinc": 6, "singl": [1, 3, 9], "size": [3, 6, 7, 8, 9], "skip": [2, 3], "skip_check": 2, "skip_export": [1, 3], "skip_non": 2, "skip_return": 3, "slice": 9, "smali": 4, "small": [3, 8, 9], "smaller": [8, 9, 10], "smallest": 10, "snr": 8, "so": [7, 8, 9], "soften": 9, "some": [2, 9], "someth": 9, "sort": 10, "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "source_ent": 9, "space": [6, 7], "spec": 3, "special": [6, 8, 9], "specialcharactersfilt": [8, 13], "specif": [1, 3, 5, 7, 8, 9], "specifi": [3, 4, 6, 8, 9, 10], "specifiedfieldfilt": [8, 13], 
"specifiednumericfield": 8, "specifiednumericfieldfilt": [8, 13], "spectrogram": 9, "speed": 3, "spell": 9, "spelling_error_word": 9, "split": [3, 6, 9], "split_dur": 9, "split_on_newline_tab_whitespac": [6, 13], "split_on_whitespac": [6, 13], "split_pattern": 9, "split_random_word": 9, "split_text_by_punctu": [6, 13], "split_videos_by_dur": 9, "splite": 6, "sql": 4, "stabl": 9, "standard": 9, "start": 9, "stat": [1, 3, 5, 7, 8], "state": 9, "static": 3, "statu": 3, "std": 1, "stderr": 9, "step": 9, "still": 9, "stoical": 9, "stood": 9, "stop": 9, "stopword": [6, 8], "stopwords_dir": 8, "stopwordsfilt": [8, 13], "storage_opt": 3, "store": [1, 3, 4, 5, 7, 8, 9], "store_dir": 3, "stori": 9, "str": [2, 3, 4, 6, 7, 8, 9, 10], "stranger": 9, "strategi": [8, 9], "stream": 9, "strength": 9, "string": [2, 7, 8, 9], "strip": [6, 13], "strip_char": 6, "strip_charact": 6, "structur": 3, "studi": 9, "style": 2, "sub": [1, 6, 7], "subset": [3, 4], "substr": 9, "suffix": [4, 8], "suffixfilt": [8, 13], "suitabl": 9, "sum": 7, "summar": 9, "super": 9, "superset": 2, "support": [3, 8, 9], "support_text_kei": 9, "suppos": 9, "sure": 9, "survei": 9, "swap": 9, "swap_random_char": 9, "swap_random_word": 9, "sy": 8, "syllabl": 6, "system": [3, 9], "system_prompt": 9, "system_prompt_templ": 9, "t": [3, 4, 6, 7], "t2": 9, "t2hk": 9, "t2jp": 9, "t2tw": 9, "tab": 6, "tabl": [3, 9], "tag": [6, 8, 9], "tag_field_nam": [8, 9], "taiwan": 9, "taiwanes": 9, "take": 8, "take_batch": 3, "taken": 9, "talk": 9, "target": [3, 8, 9, 10], "target_ent": 9, "target_valu": 8, "task": 9, "taylor": 9, "team": 9, "tech": 9, "technolog": 9, "technologi": 9, "temperatur": 9, "templat": 9, "temporarili": [5, 7, 8], "term": 8, "tex": [4, 9], "text": [1, 4, 5, 6, 7, 8, 9], "text_kei": [3, 4, 5], "textactionfilt": [8, 13], "textchunkmapp": [9, 13], "textentitydependencyfilt": [8, 13], "textformatt": [4, 13], "textlengthfilt": [8, 13], "than": [4, 6, 7, 8, 9, 10], "thei": [7, 9], "them": [4, 7, 8, 9], "theme": 9, 
"thi": [3, 4, 5, 6, 7, 8, 9, 10], "think": 9, "those": [3, 8, 9], "threshold": [3, 7, 8, 9], "thresholddetector": 9, "through": 9, "tib": 3, "tiktoken": 9, "time": [3, 9], "timestamp": 3, "to_json": 3, "to_jsonl": 3, "to_parquet": 3, "togeth": [7, 9], "token": [6, 7, 8, 9], "token_func": 6, "tokenizer_model": 7, "tokennumfilt": [8, 13], "too": 9, "top": [8, 9, 10], "top_p": 9, "top_ratio": 10, "topic": 9, "topk": 10, "topk_specified_field_selector": 3, "topkspecifiedfieldselector": [10, 13], "torch_dtyp": 9, "torchvis": 8, "total": [8, 9], "toward": 9, "trace": [3, 5, 7], "trace_batch_mapp": 3, "trace_dedupl": 3, "trace_filt": 3, "trace_mapp": 3, "tracer": [3, 5, 7, 13], "tradit": 9, "train": [3, 9], "transform": [8, 9], "tree": [1, 8], "trepid": 9, "truce": 9, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "trust": 9, "trust_remote_cod": [8, 9], "try_num": 9, "tsv": 4, "tsvformatt": [4, 13], "tsx": 4, "tupl": 8, "tuple_delimit": 9, "tw2": 9, "tw2sp": 9, "tw2t": 9, "two": [3, 7, 8, 9], "txt": [4, 8], "type": [2, 3, 4, 9], "u": 9, "uers_prompt_kei": 9, "ulaanbaatar": 9, "un": 8, "uncanni": 9, "undercurr": 9, "underli": 9, "understand": 3, "understood": 9, "uneasi": 9, "unexpect": 9, "unfold": 9, "unicod": 9, "unifi": [3, 4], "unified_format_dataset": 4, "uniform": [3, 8, 9], "uniformli": [8, 9], "unknown": 9, "unless": 3, "unspoken": 9, "up": 9, "update_arg": 3, "upper": 10, "upper_percentil": 10, "upper_rank": 10, "uri": 3, "url": 9, "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "usabl": 3, "use_words_aug": [6, 8], "useless": 9, "user": 3, "user_prompt": 9, "user_prompt_kei": 9, "usual": 8, "util": [3, 8], "util_th": 3, "v1": 9, "v2": 8, "valu": [2, 3, 5, 7, 8, 9, 10], "var": [5, 7, 8], "variabl": 2, "variant": 9, "vb": 4, "version": [3, 9], "vertic": [8, 9], "vertical_flip": [8, 9], "vid_cap_from_frm_arg": 9, "vid_cap_from_vid_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "video": [5, 7, 8, 9], "video_kei": [5, 9], "video_manag": 9, "videoaestheticsfilt": [8, 
13], "videoaspectratiofilt": [8, 13], "videocaptioningfromaudiomapp": [9, 13], "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "videocaptioningfromvideomapp": [9, 13], "videodedupl": [7, 13], "videodurationfilt": [8, 13], "videofaceblurmapp": [9, 13], "videoffmpegwrappedmapp": [9, 13], "videoframestextsimilarityfilt": [8, 13], "videomotionscorefilt": [8, 13], "videomotionscoreraftfilt": [8, 13], "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "videoremovewatermarkmapp": [9, 13], "videoresizeaspectratiomapp": [9, 13], "videoresizeresolutionmapp": [9, 13], "videoresolutionfilt": [8, 13], "videosplitbydurationmapp": [9, 13], "videosplitbykeyframemapp": [9, 13], "videosplitbyscenemapp": [9, 13], "videotaggingfromaudiomapp": [9, 13], "videotaggingfromframesfilt": [8, 13], "videotaggingfromframesmapp": [9, 13], "videowatermarkfilt": [8, 13], "vietnames": [6, 8], "vision": [8, 9], "visison": 9, "vit": 8, "vllm": 9, "voic": 9, "w": [8, 9], "w1": 4, "w2": 4, "w3": 4, "wa": [3, 9], "wai": [6, 9], "watch": 9, "watermark": [8, 9], "watermark_detector": 8, "we": [3, 4, 7, 8, 9, 13], "weight": [4, 7, 9], "well": 9, "were": 9, "what": [8, 9], "whatev": 9, "when": [3, 4, 5, 7, 8, 9, 10], "where": 3, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "which": [3, 5, 7, 8, 9], "while": 8, "whitespac": [7, 9], "whitespace_charact": 9, "whitespacenormalizationmapp": [9, 13], "who": 9, "whole": [1, 8, 9], "whose": [2, 8, 9], "why": 9, "width": [8, 9], "wight": 9, "wiki": 9, "wikipedia": 9, "wills": 9, "window": [1, 7], "window_s": 7, "window_width": 9, "within": [8, 9, 10], "without": [6, 9], "won": [3, 7], "word": [6, 8, 9], "wordless": 9, "wordrepetitionfilt": [8, 13], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "words_augment": [6, 13], "words_refin": [6, 13], "wordsnumfilt": [8, 13], "work": [3, 8, 9], "work_dir": 3, "worker": 3, "workload": 3, "would": 9, "wrapper": 9, "write": 9, "www": 8, "x1": 9, "x2": 9, "xinyu1205": 8, 
"xml": [4, 8, 9], "xxx": 3, "y1": 9, "y2": 9, "yaml": [2, 9], "ye": 9, "yml": 2, "you": 9, "your": 9, "youth": 9, "z": 9, "zh": [6, 8], "zsh": 4, "zst": 4, "\u4e00\u573a\u8715\u53d8\u5df2\u7ecf\u5f00\u59cb": 9, "\u4e00\u79cd\u65b0\u7684\u51b3\u5fc3": 9, "\u4e00\u80a1\u4e0d\u7965\u7684\u6c14\u606f\u7b3c\u7f69\u7740\u4ed6\u4eec": 9, "\u4e00\u81f4": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, "\u4e0d\u518d\u4ec5\u4ec5\u662f\u89c2\u5bdf\u548c\u62a5\u544a": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e0d\u8981\u5305\u542b\u4e3b\u89c2\u770b\u6cd5": 9, "\u4e0d\u8981\u641e\u53cd\u4e86": 9, "\u4e0d\u8981\u6dfb\u52a0\u6587\u672c\u4e2d\u6ca1\u6709\u7684\u60c5\u8282": 9, "\u4e0d\u8981\u8f93\u51fa\u5176\u4ed6\u591a\u4f59\u5185\u5bb9": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "\u4e0d\u8981\u9057\u6f0f\u60c5\u8282\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u4e14\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u4e14\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u4e4b\u524d": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "\u4e8b\u5b9e\u6027": 9, "\u4eba\u7269": 9, "\u4eba\u72691": 9, "\u4eba\u72692": 9, "\u4eba\u72693": 9, "\u4eba\u7269\u8eab\u4efd": 9, "\u4ece\u6587\u672c\u4e2d\u603b\u7ed3": 9, "\u4ece\u88ab\u52a8\u63a5\u53d7\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005": 9, "\u4ed6\u4eec\u5728\u63a5\u4e0b\u6765\u51e0\u4e2a\u5c0f\u65f6\u5185\u505a\u51fa\u7684\u51b3\u5b9a\u53ef\u80fd\u4f1a\u91cd\u65b0\u5b9a\u4e49\u4eba\u7c7b\u5728\u5b87\u5b99\u4e2d\u7684\u4f4d\u7f6e": 9, "\u4ed6\u4eec\u5df2\u6210\u4e3a\u67d0\u4e2a\u8d85\u8d8a\u661f\u8fb0\u4e0e\u6761\u7eb9\u7684\u9886\u57df\u7684\u4fe1\u606f\u5b88\u62a4\u8005": 9, "\u4ee3\u8868\u6027\u793a\u4f8b": 9, "\u4ee3\u8868\u6027\u793a\u4f8b1": 9, "\u4ee3\u8868\u6027\u793a\u4f8b2": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "\u4efb\u52a1": 9, "\u4efb\u52a1\u6f14\u53d8": 9, "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, 
"\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u4f46\u4ecd\u7136\u9700\u8981\u7b26\u5408\u4e8b\u5b9e": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "\u4f60\u597d": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u5c06\u4eba\u7269\u4e4b\u95f4\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u4f60\u7684\u4efb\u52a1\u662f\u6839\u636e\u53c2\u8003\u4fe1\u606f\u4fee\u6539\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u4f7f": 9, "\u4f7f\u5176\u66f4\u52a0\u8be6\u7ec6": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "\u51b3\u7b56": 9, "\u51b3\u7b56\u5236\u5b9a": 9, "\u51c6\u786e": 9, "\u5219\u4ee5\u4ed6\u4eec\u5927\u80c6\u7684\u65b0\u9891\u7387\u9707\u52a8": 9, "\u534e\u76db\u987f": 9, "\u534e\u76db\u987f\u662f\u6b63\u5728\u63a5\u6536\u901a\u8baf\u7684\u5730\u65b9": 9, "\u539f\u56e0": 9, "\u53c2\u8003\u4fe1\u606f": 9, "\u53ea\u62bd\u53d6\u60c5\u8282\u4e2d\u7684\u4e3b\u8981\u4eba\u7269": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u6821\u51c6\u540e\u7684\u95ee\u9898": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u56e2\u961f": 9, "\u56e2\u961f\u6536\u5230\u6765\u81ea\u534e\u76db\u987f\u7684\u901a\u8baf": 9, "\u56e2\u961f\u7684\u4efb\u52a1\u5df2\u7ecf\u6f14\u53d8": 9, "\u56e2\u961f\u76f4\u63a5\u53c2\u4e0e\u675c\u5c14\u585e\u884c\u52a8": 9, "\u56e2\u961f\u7ad9\u7acb\u7740": 9, "\u56e2\u961f\u88ab\u63cf\u7ed8\u6210\u4e00\u7fa4\u4ece\u88ab\u52a8\u89c2\u5bdf\u8005\u8f6c\u53d8\u4e3a\u79ef\u6781\u53c2\u4e0e\u8005\u7684\u4eba": 9, "\u5728\u8bed\u8a00\u98ce\u683c": 9, "\u5730\u70b9": 9, "\u5916\u90e8\u5f71\u54cd": 9, "\u5b83\u9700\u8981\u4e00\u79cd\u65b0\u7684\u89c6\u89d2": 9, "\u5b87\u5b99\u610f\u4e49": 9, "\u5bf9": 9, 
"\u5bf9\u6587\u672c\u7684\u60c5\u8282\u8fdb\u884c\u5206\u70b9\u603b\u7ed3": 9, "\u5bf9\u8bdd\u4e2d\u7684\u7d27\u5f20\u60c5\u7eea\u901a\u8fc7\u561f\u561f\u58f0\u548c\u9759\u7535\u566a\u97f3\u8d2f\u7a7f\u59cb\u7ec8": 9, "\u5bf9\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u5c0f\u7ec4\u5f00\u59cb\u5904\u7406\u9010\u6e10\u6210\u5f62\u7684\u8b66\u544a": 9, "\u5c3d\u91cf\u4e0d\u8981\u9057\u6f0f\u5185\u5bb9": 9, "\u5c55\u793a\u4e86\u4ed6\u4eec\u89d2\u8272\u7684\u52a8\u6001\u53d8\u5316": 9, "\u5e76\u4e14\u4ece\u539f\u6587\u6458\u5f55\u6700\u80fd\u8bf4\u660e\u8be5": 9, "\u5e76\u62bd\u53d6\u4e0e\u60c5\u8282\u76f8\u5173\u7684\u4eba\u7269": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f\u8f93\u51fa": 9, "\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u603b\u7ed3\u683c\u5f0f\u5982\u4e0b": 9, "\u60c5\u8282": 9, "\u60c5\u82821": 9, "\u60c5\u82822": 9, "\u60c5\u82823": 9, "\u60c5\u8282\u63cf\u8ff0": 9, "\u6216\u8005\u5c06\u4ed6\u4eec\u7f6e\u4e8e\u65e0\u77e5\u548c\u6f5c\u5728\u5371\u9669\u4e4b\u4e2d": 9, "\u6267\u884c\u5176\u6f14\u53d8\u540e\u7684\u76ee\u6807\u548c\u6d3b\u52a8": 9, "\u6280\u672f": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, "\u63cf\u8ff0": 9, "\u63d0\u4f9b\u7684": 9, "\u63d0\u53d6\u51fa\u6765": 9, "\u6458\u5f55\u7684\u793a\u4f8b\u5e94\u8be5\u7b80\u77ed": 9, "\u6587\u672c": 9, "\u65b0\u751f\u6210\u7684": 9, "\u6635\u79f0": 9, "\u663e\u7136": 9, "\u663e\u793a\u51fa\u76ee\u6807\u548c\u6d3b\u52a8\u7684\u91cd\u5927\u8f6c\u53d8": 9, "\u675c\u5c14\u585e\u884c\u52a8": 9, "\u675c\u5c14\u585e\u884c\u52a8\u88ab\u63cf\u8ff0\u4e3a\u4e00\u9879\u5df2\u6f14\u53d8\u4e3a\u4e92\u52a8\u548c\u51c6\u5907\u7684\u4efb\u52a1": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u6885\u745f\u540e\u6765\u7684\u76f4\u89c9\u5360\u636e\u4e86\u4e0a\u98ce": 9, "\u6ce8\u610f": 9, 
"\u6ce8\u610f\u76f8\u5173\u4eba\u7269\u9700\u8981\u5728\u5bf9\u5e94\u60c5\u8282\u4e2d\u51fa\u73b0": 9, "\u6ce8\u610f\u8981\u5c3d\u53ef\u80fd\u4fdd\u7559\u6587\u672c\u7684\u4e13\u6709\u540d\u8bcd": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u751f\u6210\u7684": 9, "\u7684": 9, "\u7684\u4ee3\u8868\u6027\u793a\u4f8b": 9, "\u7684\u539f\u6587\u6458\u5f551": 9, "\u7684\u539f\u6587\u6458\u5f552": 9, "\u7684\u6635\u79f0": 9, "\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "\u76f8\u5173\u4eba\u7269": 9, "\u76f8\u540c\u7684\u8bf4\u8bdd\u4eba\u548c\u88ab\u79f0\u547c\u4eba\u6700\u591a\u7ed9\u51fa\u4e00\u4e2a\u6700\u5e38\u7528\u7684\u79f0\u547c": 9, "\u79ef\u6781\u53c2\u4e0e": 9, "\u79f0\u547c\u65b9\u5f0f": 9, "\u79f0\u547c\u65b9\u5f0f1": 9, "\u79f0\u547c\u65b9\u5f0f2": 9, "\u79f0\u547c\u65b9\u5f0f3": 9, "\u7acb\u573a\u7b49\u4efb\u4e00\u65b9\u9762\u4e0e\u539f\u56de\u7b54\u76f8\u53cd": 9, "\u7b26\u5408\u539f\u6587\u4e8b\u5b9e": 9, "\u7ec4\u7ec7": 9, "\u7ed9\u5b9a\u4e00\u6bb5\u6587\u672c": 9, "\u7ed9\u5b9a\u4f60\u4e00\u6bb5\u6587\u672c": 9, "\u800c": 9, "\u800c\u4e14": 9, "\u800c\u662f\u4e92\u52a8\u548c\u51c6\u5907": 9, "\u8054\u7cfb\u4e0a\u4e0b\u6587\u8bf4\u660e\u524d\u56e0\u540e\u679c": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "\u8868\u660e\u5176\u5728\u51b3\u7b56\u8fc7\u7a0b\u4e2d\u7684\u91cd\u8981\u6027": 9, "\u88ab\u79f0\u547c\u4eba": 9, "\u8be5": 9, "\u8bf4\u8bdd\u4eba": 9, "\u8bf7\u4e0d\u8981\u8f93\u51fa\u4e92\u76f8\u6ca1\u6709\u6635\u79f0\u7684\u79f0\u547c\u65b9\u5f0f": 9, "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u8bf7\u6839\u636e\u63d0\u4f9b\u7684": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, 
"\u8f93\u51fa\u683c\u5f0f\u5982\u4e0b": 9, "\u8fd9\u4e00\u4f7f\u547d\u7684\u63d0\u5347\u4e0d\u80fd\u88ab\u89c4\u5219\u548c\u65e2\u5b9a\u534f\u8bae\u6240\u675f\u7f1a": 9, "\u8fd9\u5f71\u54cd\u4e86\u4ed6\u4eec\u7684\u51b3\u7b56\u8fc7\u7a0b": 9, "\u8fd9\u79cd\u57fa\u8c03\u4e0d\u662f\u7531\u4e16\u4fd7\u8bbe\u5b9a\u7684": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fdb\u884c\u6821\u51c6": 9, "\u9075\u5faa\u5982\u4e0b\u7684\u56de\u590d\u683c\u5f0f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u95ee\u9898": 9, "\u9700\u8981\u5728": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "\u9700\u8981\u7ed9\u51fa\u8bf4\u8bdd\u4eba\u5bf9\u88ab\u79f0\u547c\u4eba\u7684\u79f0\u547c": 9}, "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "titleterms": {"": 13, "analysi": 1, "api": 13, "common": 6, "config": 2, "core": 3, "data": 13, "data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "dedupl": 7, "document": 13, "filter": 8, "format": 4, "indic": 13, "juicer": 13, "mapper": 9, "op": [5, 6, 7, 8, 9, 10], "refer": 13, "selector": 10, "tabl": 13, "tool": 11, 
"tutori": 13, "util": 12, "welcom": 13}}) \ No newline at end of file