diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index f20745f11..c8a2a259f 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -325,9 +325,10 @@

Source code for data_juicer.core.data

 
         if inspect.ismethod(called_func):
             # batched is required for fault-tolerant or batched OP
-            if not called_func.__self__.turbo or hasattr(
+            if callable(getattr(
                     called_func.__self__,
-                    'is_batched_op') and called_func.__self__.is_batched_op():
+                    'is_batched_op')) and called_func.__self__.is_batched_op(
+                    ) or not called_func.__self__.turbo:
                 kargs['batched'] = True
                 kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
                     called_func.__self__, 'is_batched_op'
@@ -335,6 +336,12 @@ 

Source code for data_juicer.core.data

             else:
                 kargs['batched'] = False
 
+            # rank is required for cuda model loading
+            if callable(
+                    getattr(called_func.__self__,
+                            'use_cuda')) and called_func.__self__.use_cuda():
+                kargs['with_rank'] = True
+
         if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
             new_fingerprint = generate_fingerprint(self, *args, **kargs)
             kargs['new_fingerprint'] = new_fingerprint
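
To make the dispatch above concrete, here is a minimal sketch of the kwargs this wrapper ends up building before the underlying map call, for a hypothetical op that is batched, turbo-enabled and CUDA-backed. The FakeOp class is illustrative only (not a real Data-Juicer op), and the getattr calls use a None default so the sketch also runs for objects lacking these methods; the patched code above omits the default because every op defines them.

    import inspect

    class FakeOp:
        turbo = True
        def is_batched_op(self):
            return True
        def use_cuda(self):
            return True
        def process_batched(self, samples, rank=None):
            return samples

    op = FakeOp()
    called_func = op.process_batched
    kargs = {}
    if inspect.ismethod(called_func):
        obj = called_func.__self__
        # batched is required for fault-tolerant or batched OPs
        if callable(getattr(obj, 'is_batched_op', None)) and obj.is_batched_op() \
                or not obj.turbo:
            kargs['batched'] = True
            kargs['batch_size'] = 1
        # rank lets each map worker select its own CUDA device
        if callable(getattr(obj, 'use_cuda', None)) and obj.use_cuda():
            kargs['with_rank'] = True
    print(kargs)  # {'batched': True, 'batch_size': 1, 'with_rank': True}
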
@@ -379,10 +386,12 @@ 

Source code for data_juicer.core.data

             called_func = called_func.__wrapped__
 
         # Batched is always required for fault tolerance
-        if inspect.ismethod(
-                called_func) and called_func.__self__.is_batched_op():
-            kargs['batched'] = True
-            kargs['batch_size'] = kargs.pop('batch_size', 1)
+        if inspect.ismethod(called_func):
+            if callable(getattr(
+                    called_func.__self__,
+                    'is_batched_op')) and called_func.__self__.is_batched_op():
+                kargs['batched'] = True
+                kargs['batch_size'] = kargs.pop('batch_size', 1)
 
         if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
             new_fingerprint = generate_fingerprint(self, *args, **kargs)
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index d6b0bf3f8..d736b81d0 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -81,6 +81,7 @@ 

Source code for data_juicer.ops.base_op

 import traceback
 from functools import wraps
 
+import numpy as np
 import pyarrow as pa
 from loguru import logger
 
@@ -212,6 +213,11 @@ 

Source code for data_juicer.ops.base_op

         self.image_key = kwargs.get('image_key', 'images')
         self.audio_key = kwargs.get('audio_key', 'audios')
         self.video_key = kwargs.get('video_key', 'videos')
+
+        self.query_key = kwargs.get('query_key', 'query')
+        self.response_key = kwargs.get('response_key', 'response')
+        self.history_key = kwargs.get('history_key', 'history')
+
         self.batch_size = kwargs.get('batch_size', 1000)
 
         # whether the model can be accelerated using cuda
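
Like the existing text/image/audio/video keys, the three new keys are plain kwargs with defaults, so a config can remap them onto a dataset's own column names. A small hedged illustration of the resolution rule (the op call in the comment is hypothetical):

    # A config would pass these through an op's **kwargs, e.g.
    # SomeMapper(query_key='question', response_key='answer')  # hypothetical op
    kwargs = {'query_key': 'question', 'response_key': 'answer'}
    print(kwargs.get('query_key', 'query'))        # 'question'
    print(kwargs.get('response_key', 'response'))  # 'answer'
    print(kwargs.get('history_key', 'history'))    # 'history' (default kept)
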
@@ -289,6 +295,9 @@ 

Source code for data_juicer.ops.base_op

             dataset = NestedDataset(dataset)
         return dataset
 
+    def empty_history(self):
+        return np.empty((0, 0), dtype=str)
+
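
The (0, 0) shape is what makes this a usable "no history" placeholder: it serializes to an empty list of [query, response] turns while still carrying a string dtype, which presumably keeps the column type consistent when empty and non-empty histories are mixed in one batch (a reading of the intent, not a statement from the patch):

    import numpy as np

    h = np.empty((0, 0), dtype=str)
    print(h.shape)     # (0, 0)
    print(h.tolist())  # [] -- an empty list of [query, response] pairs
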
 
 
[docs]class Mapper(OP):

diff --git a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html b/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
deleted file mode 100644
index 4984076cb..000000000
--- a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
+++ /dev/null
@@ -1,275 +0,0 @@
-  data_juicer.ops.mapper.extract_qa_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.extract_qa_mapper

-import json
-import re
-from typing import Dict, Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
-OP_NAME = 'extract_qa_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class ExtractQAMapper(Mapper): - """ - Mapper to extract question and answer pair from text samples. - Recommended model list: [ - 'alibaba-pai/pai-llama3-8b-doc2qa', - 'alibaba-pai/pai-baichuan2-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-4b-doc2qa', - 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-1b8-doc2qa', - 'alibaba-pai/pai-qwen1_5-0b5-doc2qa' - ] - These recommended models are all trained with Chinese data - and are suitable for Chinese. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code: bool = False, - pattern: Optional[str] = None, - qa_format: str = 'chatml', - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param pattern: regular expression pattern to search for within text. - :param qa_format: Output format of question and answer pair. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. - e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - - The default data format parsed by this interface is as follows: - Model Input: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Model Output: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Human: 请问蒙古国的首都是哪里? - Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 - Human: 冰岛的首都是哪里呢? - Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 - ... - """ - - super().__init__(*args, **kwargs) - self.num_proc = 1 - - if pattern is None: - self.pattern = r'Human: (.*?)\nAssistant: (.*?)(?=\nHuman|$)' - else: - self.pattern = pattern - - self.qa_format = qa_format - self.enable_vllm = enable_vllm - - if enable_vllm: - - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) - self.sampling_params = sampling_params
- - def _extract_qa(self, output): - """Extract qestion and answer pair from model output response.""" - qa_list = [] - - pat = re.compile(self.pattern, re.DOTALL) - qa_pairs = pat.findall(output) - - for _, qa in enumerate(qa_pairs, 1): - user, assistant = qa - qa_list.append((user.strip(), assistant.strip())) - - return qa_list - -
[docs] def process_single(self, sample, rank=None): - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - if self.enable_vllm: - response = model.generate([sample[self.text_key]], - self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(sample[self.text_key], - return_tensors='pt').to(model.device) - response = model.generate(**inputs, **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - qa_list = self._extract_qa(output) - - if not len(qa_list): - logger.info( - 'No question and answer data was extracted from this sample!') - - dialogue_data = [] - if self.qa_format == 'chatml': - for qa in qa_list: - dialogue_data.append({ - 'messages': [{ - 'role': 'user', - 'content': qa[0] - }, { - 'role': 'assistant', - 'content': qa[1] - }] - }) - else: - raise ValueError(f'Not support {self.qa_format}!') - - sample[self.text_key] = json.dumps(dialogue_data, ensure_ascii=False) - - return sample
diff --git a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
similarity index 50%
rename from _modules/data_juicer/ops/mapper/generate_instruction_mapper.html
rename to _modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
index e97503fa1..455dfa757 100644
--- a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html
+++ b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
@@ -5,7 +5,7 @@
-  data_juicer.ops.mapper.generate_instruction_mapper — data_juicer 0.2.0 documentation
+  data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 0.2.0 documentation
@@ -67,7 +67,7 @@
@@ -76,7 +76,7 @@
    -

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    +  

    Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

     import json
     import random
     import re
    @@ -94,26 +94,15 @@ 

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    vllm = LazyLoader('vllm', 'vllm') rouge = LazyLoader('rouge', 'rouge') -DEFAULT_PROMPT_TEMPLATE = """ -请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求: -1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。 -2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。 -3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。 -4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。 -{augmented_data} -""" -QA_EXTRACTION_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)' -EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n\n{qa_pairs}' -QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' - -OP_NAME = 'generate_instruction_mapper' +OP_NAME = 'generate_qa_from_examples_mapper' # TODO: Extend LLM-based OPs into API-based implementation. -
    [docs]@UNFORKABLE.register_module(OP_NAME) +
    [docs]@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class GenerateInstructionMapper(Mapper): - """Mapper to generate new instruction text data. +class GenerateQAFromExamplesMapper(Mapper): + """ + Mapper to generate question and answer pairs from examples. You should configure an empty dataset in your yaml config file: ``` generated_dataset_config: @@ -124,161 +113,148 @@

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    The number of samples generated is determined by the length of the empty dataset. """ + + DEFAULT_SYSTEM_PROMPT = ( + '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。' + '注意,新生成的【问题】和【回答】需要满足如下要求:\n' + '1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n' + '2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n' + '3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n' + '4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n') + + DEFAULT_INPUT_TEMPLATE = '{}' + DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' + DEFAULT_OUTPUT_PATTERN = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)' + _accelerator = 'cuda' -
    [docs] def __init__(self, - hf_model: str = 'Qwen/Qwen-7B-Chat', +
[docs] def __init__(self, + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', + *, seed_file: str = '', - instruct_num: PositiveInt = 3, - trust_remote_code: bool = False, + example_num: PositiveInt = 3, similarity_threshold: float = 0.7, - prompt_template: Optional[str] = None, - qa_pair_template: Optional[str] = None, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, example_template: Optional[str] = None, - qa_extraction_pattern: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, **kwargs): """ Initialization method. - :param hf_model: Hugginface model id. - :param seed_file: Seed file path, chatml format. - :param instruct_num: The number of instruction samples. - Randomly select N samples from "seed_file" and - put them into prompt as instruction samples. - :param trust_remote_code: passed to transformers + :param hf_model: Hugging Face model ID. + :param seed_file: Path to the seed file in chatml format. + :param example_num: The number of selected examples. + Randomly select N examples from "seed_file" and + put them into prompt as QA examples. :param similarity_threshold: The similarity score threshold - between the generated samples and the seed samples. + between the generated samples and the seed examples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept. - :param prompt_template: Prompt template for generate samples. - Please make sure the template contains "{augmented_data}", - which corresponds to the augmented samples. - :param qa_pair_template: Prompt template for generate question - and answer pair description. Please make sure the template - contains two "{}" to format question and answer. - Default: '【问题】\n{}\n【回答】\n{}\n'. - :param example_template: Prompt template for generate examples. - Please make sure the template contains "{qa_pairs}", which - corresponds to the question and answer pair description - generated by param `qa_pair_template`. - Default: '\n如下是一条示例数据:\n\n{qa_pairs}' - :param qa_extraction_pattern: Regular expression pattern for parsing - question and answer from model response. + :param system_prompt: System prompt for guiding the generation task. + :param input_template: Template for building the input prompt. It must + include one placeholder '{}', which will be replaced by + `example_num` formatted examples defined by `example_template`. + :param example_template: Template for formatting one QA example. It + must include one placeholder '{}', which will be replaced by one + formatted qa_pair. + :param qa_pair_template: Template for formatting a single QA pair + within each example. Must include two placeholders '{}' for the + question and answer. + :param output_pattern: Regular expression pattern to extract questions + and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config.
- :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. + :param model_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args + :param kwargs: Extra keyword arguments. """ - super().__init__(*args, **kwargs) - self.num_proc = 1 + super().__init__(**kwargs) if not seed_file: raise ValueError( 'Please provide `seed_file` in chatml format.' 'Example: data-juicer/demos/data/demo-dataset-chatml.jsonl') - self.instruct_num = instruct_num + self.seed_file = seed_file + self.example_num = example_num self.similarity_threshold = similarity_threshold self.similarity_type = 'rouge_l' - if prompt_template is None: - prompt_template = DEFAULT_PROMPT_TEMPLATE - if qa_pair_template is None: - qa_pair_template = QA_PAIR_TEMPLATE - if example_template is None: - example_template = EXAMPLE_TEMPLATE - if qa_extraction_pattern is None: - qa_extraction_pattern = QA_EXTRACTION_PATTERN - - self.prompt_template = prompt_template - self.qa_pair_template = qa_pair_template - self.example_template = example_template - self.qa_extraction_pattern = qa_extraction_pattern + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.example_template = example_template or self.DEFAULT_EXAMPLE_TEMPLATE # noqa: E501 + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: tensor_parallel_size = torch.cuda.device_count() logger.info(f'Set tensor_parallel_size to \ {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + **model_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) + return_pipe=True, + **model_params) self.sampling_params = sampling_params - self.seed_qa_samples = self.load_seed_qa_samples(seed_file) - + self.seed_qa_samples = self._load_seed_qa_samples() if len(self.seed_qa_samples) == 0: - raise ValueError('No QA data was parsed from the seed file!') + raise ValueError('No QA data was parsed from the seed file!')
    - self.reference_samples = [ - '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_pairs]) + '\n' - for qa_pairs in self.seed_qa_samples - ]
    - -
    [docs] def load_seed_qa_samples(self, seed_file): + def _load_seed_qa_samples(self): """Load QA pairs from chatml format file.""" qa_samples = [] - with open(seed_file) as f: + with open(self.seed_file, encoding='utf-8') as f: lines = f.readlines() for line in lines: line = line.strip() - qa_pairs = self.parse_chatml_str(line) + qa_pairs = self._parse_chatml_str(line) if len(qa_pairs) > 0: qa_samples.append(qa_pairs) + return qa_samples - return qa_samples
    - -
    [docs] def build_prompt(self, qa_samples, prompt_template): + def _sample_to_str(self, qa_sample): + return '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_sample]) + '\n' - def format_qa_pairs(qa_pairs): - return ''.join([ - self.qa_pair_template.format(q, a) for q, a in qa_pairs - if q and a - ]) - - body_fragments = [ - self.example_template.format(qa_pairs=format_qa_pairs(qa_pairs)) - for qa_pairs in qa_samples - ] - - body = ''.join(body_fragments) - - return prompt_template.format(augmented_data=body)
    + def _max_rouge_l_score(self, hypothesis, references): + r = rouge.Rouge() + max_score = 0.0 + hyp_str = self._sample_to_str(hypothesis) + for reference in references: + ref_str = self._sample_to_str(reference) + scores = r.get_scores(hyp_str, ref_str) + rouge_l_score = scores[0]['rouge-l']['f'] + if rouge_l_score > max_score: + max_score = rouge_l_score + return max_score -
    [docs] def parse_chatml_str(self, input_str): + def _parse_chatml_str(self, sample_str): user_input = None assistant_output = None qa_pairs = [] - data = json.loads(input_str) + data = json.loads(sample_str) for message in data['messages']: role = message['role'] content = message['content'] @@ -287,79 +263,91 @@

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    elif role == 'assistant': assistant_output = content qa_pairs.append((user_input, assistant_output)) - return qa_pairs
    - -
    [docs] def parse_response(self, response_str): - pattern = self.qa_extraction_pattern - matches = re.findall(pattern, response_str, re.DOTALL) - response_str = '' - out_qa_pairs = [] - for i, match in enumerate(matches): - question, answer = match - question = question.strip() - answer = answer.strip() - out_qa_pairs.append((question, answer)) - response_str += question + '\n' + answer + '\n' + return qa_pairs - if len(out_qa_pairs) == 0: - logger.error('Parse model response error! ' - 'No data generated for the current response!') +
    [docs] def build_input(self, qa_examples): - return out_qa_pairs, response_str
    - -
[docs] def max_rouge_l_score(self, reference, candidates): + def format_qa_pairs(qa_example): + return ''.join([ + self.qa_pair_template.format(q, a) for q, a in qa_example + if q and a + ]) - r = rouge.Rouge() - max_score = 0.0 - for candidate in candidates: - scores = r.get_scores(candidate, reference) - rouge_l_score = scores[0]['rouge-l']['f'] - if rouge_l_score > max_score: - max_score = rouge_l_score - return max_score + formatted_examples = ''.join([ + self.example_template.format(format_qa_pairs(qa_example)) + for qa_example in qa_examples + ]) + input_prompt = self.input_template.format(formatted_examples) + return input_prompt
+
    + +
    [docs] def parse_output(self, raw_output): + logger.debug(raw_output) + output_qa_pairs = [] + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + for match in matches: + question, answer = match + output_qa_pairs.append((question.strip(), answer.strip())) + return output_qa_pairs
    -
    [docs] def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) +
    [docs] def process_single(self, sample=None, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) random_qa_samples = random.sample(self.seed_qa_samples, - self.instruct_num) - input_prompt = self.build_prompt(random_qa_samples, - self.prompt_template) + self.example_num) + input_prompt = self.build_input(random_qa_samples) + + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': input_prompt + }] + if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - response_str = response[0].outputs[0].text + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - output_ids = model.generate(**inputs, **self.sampling_params) - # remove the input prompt from the output - output_ids = output_ids[:, inputs.data['input_ids'].shape[1]:] - response_str = processor.decode(output_ids.cpu()[0], - skip_special_tokens=True) - message_list = [] - out_qa_pairs, response_str = self.parse_response(response_str) - - if not response_str: - return {self.text_key: json.dumps({'messages': message_list})} + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + output_qa_pairs = self.parse_output(output) + if len(output_qa_pairs) == 0: + logger.warning('Parse model response error! ' + 'No data generated for the current response!') + sample.update({ + self.query_key: '', + self.response_key: '', + self.history_key: self.empty_history() + }) + return sample if self.similarity_type == 'rouge_l': - sim_score = self.max_rouge_l_score(response_str, - self.reference_samples) + sim_score = self._max_rouge_l_score(output_qa_pairs, + random_qa_samples) else: raise ValueError( f'Not support similarity type "{self.similarity_type}"!') if sim_score <= self.similarity_threshold: - for question, answer in out_qa_pairs: - message_list.append({'role': 'user', 'content': question}) - message_list.append({'role': 'assistant', 'content': answer}) + query, response = output_qa_pairs[-1] + history = output_qa_pairs[:-1] + if len(history) == 0: + history = self.empty_history() else: + query = response = '' + history = self.empty_history() logger.info('Filter this generated sample due to similarity.') - return { - self.text_key: - json.dumps({'messages': message_list}, ensure_ascii=False) - }
    + sample.update({ + self.query_key: query, + self.response_key: response, + self.history_key: history + }) + return sample
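
For reference, one seed line in the chatml format this loader expects, together with the pairing rule _parse_chatml_str applies to it (the file content here is illustrative):

    import json

    line = json.dumps({'messages': [
        {'role': 'user', 'content': '蒙古国的首都是哪里?'},
        {'role': 'assistant', 'content': '乌兰巴托。'},
        {'role': 'user', 'content': '冰岛呢?'},
        {'role': 'assistant', 'content': '雷克雅未克。'},
    ]}, ensure_ascii=False)

    # Same pairing rule as _parse_chatml_str: each assistant turn is
    # attached to the user turn that precedes it.
    qa_pairs, user_input = [], None
    for message in json.loads(line)['messages']:
        if message['role'] == 'user':
            user_input = message['content']
        elif message['role'] == 'assistant':
            qa_pairs.append((user_input, message['content']))
    print(qa_pairs)
    # [('蒙古国的首都是哪里?', '乌兰巴托。'), ('冰岛呢?', '雷克雅未克。')]
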
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
new file mode 100644
index 000000000..02571066e
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
@@ -0,0 +1,255 @@
+  data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

    +import re
    +from typing import Dict, Optional
    +
    +from loguru import logger
    +
    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    +from data_juicer.utils.lazy_loader import LazyLoader
    +from data_juicer.utils.model_utils import get_model, prepare_model
    +
    +torch = LazyLoader('torch', 'torch')
    +vllm = LazyLoader('vllm', 'vllm')
    +
    +OP_NAME = 'generate_qa_from_text_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class GenerateQAFromTextMapper(Mapper): + """ + Mapper to generate question and answer pairs from text. + Recommended model list: [ + 'alibaba-pai/pai-llama3-8b-doc2qa', + 'alibaba-pai/pai-baichuan2-7b-doc2qa', + 'alibaba-pai/pai-qwen1_5-4b-doc2qa', + 'alibaba-pai/pai-qwen1_5-7b-doc2qa', + 'alibaba-pai/pai-qwen1_5-1b8-doc2qa', + 'alibaba-pai/pai-qwen1_5-0b5-doc2qa' + ] + These recommended models are all trained with Chinese data + and are suitable for Chinese. + """ + + _accelerator = 'cuda' + _batched_op = True + +
[docs] def __init__(self, + hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', + *, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, + **kwargs): + """ + Initialization method. + + :param hf_model: Hugging Face model ID. + :param output_pattern: Regular expression pattern to extract + questions and answers from model response. + :param enable_vllm: Whether to use vllm for inference acceleration. + :param model_params: Parameters for initializing the model. + :param sampling_params: Sampling parameters for text generation, + e.g. {'temperature': 0.9, 'top_p': 0.95} + :param kwargs: Extra keyword arguments. + + The default data format parsed by this interface is as follows: + Model Input: + 蒙古国的首都是乌兰巴托(Ulaanbaatar) + 冰岛的首都是雷克雅未克(Reykjavik) + Model Output: + 蒙古国的首都是乌兰巴托(Ulaanbaatar) + 冰岛的首都是雷克雅未克(Reykjavik) + Human: 请问蒙古国的首都是哪里? + Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 + Human: 冰岛的首都是哪里呢? + Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 + ... + """ + + super().__init__(**kwargs) + + if output_pattern is None: + self.output_pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)' # noqa: E501 + else: + self.output_pattern = output_pattern + + self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} + + if enable_vllm: + assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=hf_model, + **model_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) + else: + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=hf_model, + return_pipe=True, + **model_params) + self.sampling_params = sampling_params
    + +
    [docs] def parse_output(self, raw_output): + logger.debug(raw_output) + qa_list = [] + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + for match in matches: + user, assistant = match + qa_list.append((user.strip(), assistant.strip())) + return qa_list
    + +
    [docs] def process_batched(self, samples, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) + + input_keys = samples.keys() + num_samples = len(samples[next(iter(input_keys))]) + output_keys = input_keys | {self.query_key, self.response_key} + output_samples = {key: [] for key in output_keys} + + for i in range(num_samples): + messages = [{'role': 'user', 'content': samples[self.text_key][i]}] + + if self.enable_vllm: + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + else: + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + qa_list = self.parse_output(output) + if len(qa_list) > 0: + for q, a in qa_list: + for input_k in input_keys: + output_samples[input_k].append(samples[input_k][i]) + output_samples[self.query_key].append(q) + output_samples[self.response_key].append(a) + else: + logger.warning( + 'No question and answer was extracted from current sample!' + ) + + return output_samples
    +
diff --git a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html b/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
deleted file mode 100644
index 9404f2336..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
+++ /dev/null
@@ -1,224 +0,0 @@
-  data_juicer.ops.mapper.optimize_instruction_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_instruction_mapper

    -from typing import Dict, Optional
    -
    -from loguru import logger
    -
    -from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    -from data_juicer.utils.lazy_loader import LazyLoader
    -from data_juicer.utils.model_utils import get_model, prepare_model
    -
    -torch = LazyLoader('torch', 'torch')
    -vllm = LazyLoader('vllm', 'vllm')
    -
    -DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。'
    -
    -OP_NAME = 'optimize_instruction_mapper'
    -
    -
    -# TODO: Extend LLM-based OPs into API-based implementation.
    -
    [docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class OptimizeInstructionMapper(Mapper): - """Mapper to optimize instruction. - Recommended model list: [ - alibaba-pai/Qwen2-1.5B-Instruct-Refine - alibaba-pai/Qwen2-7B-Instruct-Refine - ] - """ - _accelerator = 'cuda' - -
    [docs] def __init__(self, - hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', - trust_remote_code: bool = False, - system_prompt: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param system_prompt: System prompt for optimize samples. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. - e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.num_proc = 1 - - if system_prompt is None: - system_prompt = DEFAULT_SYSTEM_PROMPT - self.system_prompt = system_prompt - self.enable_vllm = enable_vllm - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) - self.sampling_params = sampling_params
    - -
    [docs] def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) - - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': sample[self.text_key] - }] - input_prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) - - if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - response = model.generate(**inputs, - eos_token_id=processor.eos_token_id, - **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - sample[self.text_key] = output - - return sample
    -
diff --git a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
new file mode 100644
index 000000000..a040d8438
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
@@ -0,0 +1,254 @@
+  data_juicer.ops.mapper.optimize_qa_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_qa_mapper

    +import re
    +from typing import Dict, Optional
    +
    +from loguru import logger
    +
    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    +from data_juicer.utils.lazy_loader import LazyLoader
    +from data_juicer.utils.model_utils import get_model, prepare_model
    +
    +torch = LazyLoader('torch', 'torch')
    +vllm = LazyLoader('vllm', 'vllm')
    +
    +OP_NAME = 'optimize_qa_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeQAMapper(Mapper): + """ + Mapper to optimize question-answer pairs. + """ + + # avoid leading whitespace + DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。' + '必须按照以下标记格式,直接输出优化后的问答对:\n' + '【问题】\n' + '优化后的问题\n' + '【回答】\n' + '优化后的回答') + DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' + DEFAULT_OUTPUT_PATTERN = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)' + + _accelerator = 'cuda' + +
    [docs] def __init__(self, + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', + *, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, + **kwargs): + """ + Initialization method. + + :param hf_model: Hugging Face model ID. + :param system_prompt: System prompt for guiding the optimization task. + :param input_template: Template for building the input for the model. + Please make sure the template contains one placeholder '{}', which + corresponds to the question and answer pair generated by + param `qa_pair_template`. + :param qa_pair_template: Template for formatting the question and + answer pair. Please make sure the template contains two + '{}' to format question and answer. + :param output_pattern: Regular expression pattern to extract question + and answer from model response. + :param enable_vllm: Whether to use VLLM for inference acceleration. + :param model_params: Parameters for initializing the model. + :param sampling_params: Sampling parameters for text generation (e.g., + {'temperature': 0.9, 'top_p': 0.95}). + :param kwargs: Extra keyword arguments. + """ + super().__init__(**kwargs) + + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN + + self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} + + if enable_vllm: + assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=hf_model, + **model_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) + else: + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=hf_model, + return_pipe=True, + **model_params) + self.sampling_params = sampling_params
    + +
    [docs] def build_input(self, sample): + qa_pair = self.qa_pair_template.format(sample[self.query_key], + sample[self.response_key]) + input_prompt = self.input_template.format(qa_pair) + return input_prompt
    + +
[docs] def parse_output(self, raw_output): + logger.debug(raw_output) + match = re.search(self.output_pattern, raw_output, re.DOTALL) + if match: + return match.group(1).strip(), match.group(2).strip() + else: + return None, None
    + +
    [docs] def process_single(self, sample=None, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) + + input_prompt = self.build_input(sample) + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': input_prompt + }] + + if self.enable_vllm: + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + else: + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + parsed_q, parsed_a = self.parse_output(output) + if parsed_q: + sample[self.query_key] = parsed_q + if parsed_a: + sample[self.response_key] = parsed_a + + return sample
    +
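
To see how the default templates above compose into the model input, a small worked example (the sample values are illustrative):

    qa_pair_template = '【问题】\n{}\n【回答】\n{}'   # DEFAULT_QA_PAIR_TEMPLATE
    input_template = '以下是原始问答对:\n{}'        # DEFAULT_INPUT_TEMPLATE
    sample = {'query': '蒙古国的首都是哪里?', 'response': '乌兰巴托。'}
    qa_pair = qa_pair_template.format(sample['query'], sample['response'])
    print(input_template.format(qa_pair))
    # 以下是原始问答对:
    # 【问题】
    # 蒙古国的首都是哪里?
    # 【回答】
    # 乌兰巴托。
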
diff --git a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
new file mode 100644
index 000000000..ab704fc5a
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
@@ -0,0 +1,129 @@
+  data_juicer.ops.mapper.optimize_query_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_query_mapper

    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
    +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
    +
    +OP_NAME = 'optimize_query_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeQueryMapper(OptimizeQAMapper): + """ + Mapper to optimize query in question-answer pairs. + """ + + DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501 + + _accelerator = 'cuda' + +
    [docs] def parse_output(self, raw_output): + return raw_output.strip(), None
    +
diff --git a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
new file mode 100644
index 000000000..22993ce81
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
@@ -0,0 +1,129 @@
+  data_juicer.ops.mapper.optimize_response_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_response_mapper

    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
    +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
    +
    +OP_NAME = 'optimize_response_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeResponseMapper(OptimizeQAMapper): + """ + Mapper to optimize response in question-answer pairs. + """ + + DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' + + _accelerator = 'cuda' + +
    [docs] def parse_output(self, raw_output): + return None, raw_output.strip()
    +
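
Since both subclasses only override parse_output, the inherited process_single updates exactly one side of the pair: a (text, None) return rewrites the query, (None, text) rewrites the response. A sketch of that update step:

    parsed_q, parsed_a = None, 'a more detailed answer'  # OptimizeResponseMapper-style
    sample = {'query': 'old question', 'response': 'old answer'}
    if parsed_q:
        sample['query'] = parsed_q
    if parsed_a:
        sample['response'] = parsed_a
    print(sample)  # {'query': 'old question', 'response': 'a more detailed answer'}
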
diff --git a/_modules/index.html b/_modules/index.html
index a553f9a40..7df789de5 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -159,9 +159,9 @@

    All modules for which code is available

  • data_juicer.ops.mapper.clean_ip_mapper
  • data_juicer.ops.mapper.clean_links_mapper
  • data_juicer.ops.mapper.expand_macro_mapper
  • -
  • data_juicer.ops.mapper.extract_qa_mapper
  • data_juicer.ops.mapper.fix_unicode_mapper
  • -
  • data_juicer.ops.mapper.generate_instruction_mapper
  • +
  • data_juicer.ops.mapper.generate_qa_from_examples_mapper
  • +
  • data_juicer.ops.mapper.generate_qa_from_text_mapper
  • data_juicer.ops.mapper.image_blur_mapper
  • data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
  • data_juicer.ops.mapper.image_captioning_mapper
  • @@ -170,7 +170,9 @@

    All modules for which code is available

  • data_juicer.ops.mapper.image_tagging_mapper
  • data_juicer.ops.mapper.nlpaug_en_mapper
  • data_juicer.ops.mapper.nlpcda_zh_mapper
  • -
  • data_juicer.ops.mapper.optimize_instruction_mapper
  • +
  • data_juicer.ops.mapper.optimize_qa_mapper
  • +
  • data_juicer.ops.mapper.optimize_query_mapper
  • +
  • data_juicer.ops.mapper.optimize_response_mapper
  • data_juicer.ops.mapper.punctuation_normalization_mapper
  • data_juicer.ops.mapper.remove_bibliography_mapper
  • data_juicer.ops.mapper.remove_comments_mapper
diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html
index b3126bfce..34c7a68be 100644
--- a/data_juicer.ops.deduplicator.html
+++ b/data_juicer.ops.deduplicator.html
@@ -47,15 +47,15 @@
@@ -92,44 +92,45 @@

    data_juicer.ops.deduplicator

    -
    -class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching -of videos between documents.

    +

    Deduplicator to deduplicate samples at document-level using exact matching.

    +

    Using md5 hash to deduplicate samples.
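
A minimal sketch of what md5-based exact matching amounts to; the normalization flags mirror the lowercase / ignore_non_character parameters below, but this is an illustration, not the op's actual implementation:

    import hashlib
    import re

    def md5_hash(text, lowercase=False, ignore_non_character=False):
        if lowercase:
            text = text.lower()
        if ignore_non_character:
            # keep alphabet characters only
            text = re.sub(r'[^a-zA-Z]', '', text)
        return hashlib.md5(text.encode('utf-8')).hexdigest()

    seen, kept = set(), []
    for doc in ['Hello, World!', 'hello world', 'something else']:
        h = md5_hash(doc, lowercase=True, ignore_non_character=True)
        if h not in seen:
            seen.add(h)
            kept.append(doc)
    print(kept)  # ['Hello, World!', 'something else']
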

    -
    -__init__(consider_text: bool = False, *args, **kwargs)[source]
    -

    Initialization.

    +
    +__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +

    Initialization method.

    Parameters:
      -
    • consider_text – whether to consider text hash together with video -hash when applying deduplication.

    • +
    • lowercase – Whether to convert sample text to lower case

    • +
    • ignore_non_character – Whether to ignore non-alphabet +characters, including whitespaces, digits, and punctuations

    • args – extra args

    • -
    • kwargs – extra args

    • +
    • kwargs – extra args.

    -
    -compute_hash(sample, context=False)[source]
    -

    Compute hash values for the sample.

    +
    +compute_hash(sample)[source]
    +

    Compute md5 hash values for the sample.

    Parameters:

    sample – input sample

    Returns:
    -

    sample with computed hash value.

    +

    sample with md5 hash value.

    -
    -process(dataset, show_num=0)[source]
    +
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -147,69 +148,6 @@
    -
    -
-class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, *args, **kwargs)[source]
    -

    Bases: Filter

    -

    A basic exact matching deduplicator for RAY. -Although its functionality is deduplication, -it is implemented as Filter sub-class.

    -
    -
    -EMPTY_HASH_VALUE = 'EMPTY'
    -
    - -
    -
-__init__(redis_host: str = 'localhost', redis_port: int = 6380, *args, **kwargs)[source]
    -

    Initialization. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param args: extra args -:param kwargs: extra args

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    -
    -compute_stats_single(sample, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    - -
    -
    -process_single(sample)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    - -
    -
class data_juicer.ops.deduplicator.DocumentMinhashDeduplicator(tokenization: str = 'space', window_size: int = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: int = 256, jaccard_threshold: float = 0.7, num_bands: int | None = None, num_rows_per_band: int | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]
    @@ -291,95 +229,54 @@
    -
-class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, method: str = 'phash', *args, **kwargs)[source]
    -

    Bases: RayBasicDeduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching -of images between documents.

    -
    -
-__init__(redis_host: str = 'localhost', redis_port: int = 6380, method: str = 'phash', *args, **kwargs)[source]
    -

    Initialization. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param args: extra args -:param kwargs: extra args

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    - -
    -
-class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Bases: RayBasicDeduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching.

    -
    -
-__init__(redis_host: str = 'localhost', redis_port: int = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Initialization method. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param lowercase: Whether to convert sample text to lower case -:param ignore_non_character: Whether to ignore non-alphabet -characters, including whitespaces, digits, and punctuations -:param args: extra args -:param kwargs: extra args.

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    - -
    -
    -class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching.

    -

    Using md5 hash to deduplicate samples.

    +

    Deduplicator to deduplicate samples at document-level using SimHash.

    -
    -__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Initialization method.

    +
+__init__(tokenization: str = 'space', window_size: int = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int = 6, hamming_distance: int = 4, *args, **kwargs)[source]
    +

    Initialization method :param tokenization: tokenization method for +sample texts.

    +

    It should be one of [space, punctuation, character]. For +English-like languages, we recommend to use ‘space’. And for +Chinese-like languages, we recommend to use ‘character’

    Parameters:
      -
    • lowercase – Whether to convert sample text to lower case

    • -
    • ignore_non_character – Whether to ignore non-alphabet -characters, including whitespaces, digits, and punctuations

    • -
    • args – extra args

    • -
    • kwargs – extra args.

    • +
    • window_size – window size of shingling

    • +
    • lowercase – whether to convert text to lower case first

    • +
    • ignore_pattern – whether to ignore sub-strings with +specific pattern when computing simhash

    • +
    • num_blocks – number of blocks in simhash computing

    • +
    • hamming_distance – the max hamming distance threshold in +near-duplicate detection. When the hamming distance of two +sample texts is <= this threshold, they are regarded as +similar samples and this op will only keep one of them after +deduplication. This threshold should be always less than +num_blocks
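
In other words, the near-duplicate test described in this parameter reduces to a Hamming-distance check between two simhash fingerprints; a sketch assuming 64-bit hash values:

    def hamming_distance(h1: int, h2: int) -> int:
        # count of differing bits between two simhash values
        return bin(h1 ^ h2).count('1')

    hamming_threshold = 4  # the default above
    h1 = 0b1011010111010010
    h2 = 0b1011010111011010  # differs in a single bit
    print(hamming_distance(h1, h2) <= hamming_threshold)  # True: near-duplicates
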

    -
    -compute_hash(sample)[source]
    -

    Compute md5 hash values for the sample.

    +
    +compute_hash(sample)[source]
    +

    Compute simhash values for the sample.

    Parameters:

    sample – input sample

    Returns:
    -

    sample with md5 hash value.

    +

    sample with simhash value.

    -
    -process(dataset, show_num=0)[source]
    +
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -455,71 +352,118 @@
    -
    -class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    -

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using SimHash.

    +
+class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, *args, **kwargs)[source]
    +

    Bases: Filter

    +

    A basic exact matching deduplicator for RAY. +Although its functionality is deduplication, +it is implemented as Filter sub-class.

    +
    +
    +EMPTY_HASH_VALUE = 'EMPTY'
    +
    +
    -
    -__init__(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    -

    Initialization method :param tokenization: tokenization method for -sample texts.

    -

    It should be one of [space, punctuation, character]. For -English-like languages, we recommend to use ‘space’. And for -Chinese-like languages, we recommend to use ‘character’

    -
    -
    Parameters:
    -
      -
    • window_size – window size of shingling

    • -
    • lowercase – whether to convert text to lower case first

    • -
    • ignore_pattern – whether to ignore sub-strings with -specific pattern when computing simhash

    • -
    • num_blocks – number of blocks in simhash computing

    • -
    • hamming_distance – the max hamming distance threshold in -near-duplicate detection. When the hamming distance of two -sample texts is <= this threshold, they are regarded as -similar samples and this op will only keep one of them after -deduplication. This threshold should be always less than -num_blocks

    • -
    -
    -
    +
+__init__(redis_host: str = 'localhost', redis_port: int = 6380, *args, **kwargs)[source]
    +

    Initialization. +:param redis_host: the hostname of redis server +:param redis_port: the port of redis server +:param args: extra args +:param kwargs: extra args

    -
    -compute_hash(sample)[source]
    -

    Compute simhash values for the sample.

    +
    +calculate_hash(sample, context=False)[source]
    +

    Calculate hash value for the sample.

    +
    + +
    +
    +compute_stats_single(sample, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    Parameters:
    -

    sample – input sample

    +
      +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    • +
    Returns:
    -

    sample with simhash value.

    +

    sample with computed stats

    -
    -process(dataset, show_num=0)[source]
    -

    For doc-level, dataset –> dataset.

    +
    +process_single(sample)[source]
    +

    For sample level, sample –> Boolean.

    Parameters:
    -
      -
    • dataset – input dataset

    • -
    • show_num – number of traced samples used when tracer is -open.

    • -
    +

    sample – sample to decide whether to filter

    Returns:
    -

    deduplicated dataset and the sampled duplicate pairs.

    +

    true for keeping and false for filtering

class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

Bases: RayBasicDeduplicator

Deduplicator to deduplicate samples at document-level using exact matching.

__init__(redis_host: str = 'localhost', redis_port: int = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

Initialization method.

Parameters:
• redis_host – the hostname of the redis server
• redis_port – the port of the redis server
• lowercase – whether to convert sample text to lower case
• ignore_non_character – whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
• args – extra args
• kwargs – extra args

calculate_hash(sample, context=False)[source]

Calculate the hash value for the sample.
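A minimal usage sketch for the Ray-based deduplicators. The sample layout and the '__dj__stats__' stats field name are assumptions (following the documented '__dj__' prefix convention), and a reachable redis server is required:

from data_juicer.ops.deduplicator import RayDocumentDeduplicator

# documented defaults: redis_host='localhost', redis_port=6380
op = RayDocumentDeduplicator(lowercase=True, ignore_non_character=False)
sample = {'text': 'Some document text.', '__dj__stats__': {}}  # assumed layout
sample = op.compute_stats_single(sample)  # records this text's hash (sketch)
keep = op.process_single(sample)          # False once the same hash was already seen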
class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, method: str = 'phash', *args, **kwargs)[source]

Bases: RayBasicDeduplicator

Deduplicator to deduplicate samples at document-level using exact matching of images between documents.

__init__(redis_host: str = 'localhost', redis_port: int = 6380, method: str = 'phash', *args, **kwargs)[source]

Initialization method.

Parameters:
• redis_host – the hostname of the redis server
• redis_port – the port of the redis server
• args – extra args
• kwargs – extra args

calculate_hash(sample, context=False)[source]

Calculate the hash value for the sample.
class data_juicer.ops.deduplicator.RayVideoDeduplicator(redis_host: str = 'localhost', redis_port: int = 6380, *args, **kwargs)[source]
class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]

Bases: Deduplicator

Deduplicator to deduplicate samples at document-level using exact matching of videos between documents.

__init__(consider_text: bool = False, *args, **kwargs)[source]

Initialization method.

Parameters:
• consider_text – whether to consider text hash together with video hash when applying deduplication.
• args – extra args
• kwargs – extra args

compute_hash(sample, context=False)[source]

Compute hash values for the sample.

Parameters:
sample – input sample

Returns:
sample with computed hash value.

process(dataset, show_num=0)[source]

For doc-level, dataset --> dataset.

Parameters:
• dataset – input dataset
• show_num – number of traced samples used when tracer is open.

Returns:
deduplicated dataset and the sampled duplicate pairs.
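A sketch of the document-level flow. Paths and the dataset variable are hypothetical; per the Returns note above, process gives back the deduplicated dataset plus the sampled duplicate pairs:

from data_juicer.ops.deduplicator import VideoDeduplicator

op = VideoDeduplicator(consider_text=True)  # hash text together with videos
sample = {'text': 'a caption', 'videos': ['/path/to/clip.mp4']}  # 'videos' is the default video key
sample = op.compute_hash(sample)  # adds the exact-match hash to the sample
# dataset level: deduped, dup_pairs = op.process(dataset, show_num=2)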

    data_juicer.ops.filter

class data_juicer.ops.filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with alphabet/numeric ratio within a specific range.

__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• tokenization – Whether to count the ratio of alphanumeric tokens to the total number of tokens. If tokenization=False, it counts the ratio of alphanumeric characters to the total number of characters.
• min_ratio – The min filter ratio in the alphanumeric op; samples will be filtered if their alphabet/numeric ratio is below this parameter.
• max_ratio – The max filter ratio in the alphanumeric op; samples will be filtered if their alphabet/numeric ratio exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats_batched(samples)[source]

process_batched(samples)[source]
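A minimal sketch of the batched interface. The column-oriented sample layout and the '__dj__stats__' field name are assumptions:

from data_juicer.ops.filter import AlphanumericFilter

op = AlphanumericFilter(tokenization=False, min_ratio=0.25)  # documented defaults
samples = {'text': ['abc 123', '!!!???'], '__dj__stats__': [{}, {}]}  # assumed layout
samples = op.compute_stats_batched(samples)  # fills the alphanumeric-ratio stats
flags = list(op.process_batched(samples))    # one keep/filter Boolean per sample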
class data_juicer.ops.filter.AudioDurationFilter(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Keep data samples whose audio durations are within a specified range.

__init__(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_duration – The min audio duration to keep samples, in seconds. It's 0 by default.
• max_duration – The max audio duration to keep samples, in seconds. It's sys.maxsize by default.
• any_or_all – keep this sample with 'any' or 'all' strategy of all audios. 'any': keep this sample if any audios meet the condition. 'all': keep this sample only if all audios meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
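A single-sample sketch. The audio path is hypothetical; 'audios' is the documented default audio key and '__dj__stats__' the assumed stats field:

from data_juicer.ops.filter import AudioDurationFilter

op = AudioDurationFilter(min_duration=5, max_duration=600, any_or_all='any')
sample = {'audios': ['/path/to/clip.wav'], '__dj__stats__': {}}
sample = op.compute_stats_single(sample)  # measures the audio duration(s)
keep = op.process_single(sample)          # True if any audio falls in [5, 600] s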
class data_juicer.ops.filter.AudioNMFSNRFilter(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int = 500, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Keep data samples whose audio SNRs (computed based on NMF) are within a specified range.

__init__(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int = 500, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_snr – The min audio SNR to keep samples, in dB. It's 0 by default.
• max_snr – The max audio SNR to keep samples, in dB. It's sys.maxsize by default.
• nmf_iter_num – The max number of iterations to run NMF. It's 500 by default.
• any_or_all – keep this sample with 'any' or 'all' strategy of all audios. 'any': keep this sample if any audios meet the condition. 'all': keep this sample only if all audios meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.AudioSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Keep data samples whose audio size (in bytes/KB/MB/...) is within a specific range.

__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_size – The min audio size to keep samples. Set to "0" by default, i.e., no size constraint.
• max_size – The max audio size to keep samples. Set to "1TB" by default, an approximation of the unlimited case.
• any_or_all – keep this sample with 'any' or 'all' strategy of all audios. 'any': keep this sample if any audios meet the condition. 'all': keep this sample only if all audios meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.AverageLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with average line length within a specific range.

__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• min_len – The min filter length in this op; samples will be filtered if their average line length is below this parameter.
• max_len – The max filter length in this op; samples will be filtered if their average line length exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats_batched(samples, context=False)[source]

process_batched(samples)[source]
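The batched text filters share the same pattern; a sketch with an assumed column-oriented layout:

from data_juicer.ops.filter import AverageLineLengthFilter

op = AverageLineLengthFilter(min_len=10, max_len=200)
samples = {'text': ['short\nlines', 'one much longer line ' * 10],
           '__dj__stats__': [{}, {}]}  # assumed stats field
samples = op.compute_stats_batched(samples)
flags = list(op.process_batched(samples))  # keep/filter decision per sample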
class data_juicer.ops.filter.CharacterRepetitionFilter(rep_len: int = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with char-level n-gram repetition ratio within a specific range.

__init__(rep_len: int = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

Initialization method.

Parameters:
• rep_len – Repetition length for char-level n-gram.
• min_ratio – The min filter ratio in this op; samples will be filtered if their char-level n-gram repetition ratio is below this parameter.
• max_ratio – The max filter ratio in this op; samples will be filtered if their char-level n-gram repetition ratio exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats_batched(samples)[source]

process_batched(samples)[source]
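A sketch for the repetition-ratio check; the thresholds below are the documented defaults and the sample layout is assumed:

from data_juicer.ops.filter import CharacterRepetitionFilter

op = CharacterRepetitionFilter(rep_len=10, min_ratio=0.0, max_ratio=0.5)
samples = {'text': ['normal prose text here', 'spam spam spam spam spam'],
           '__dj__stats__': [{}, {}]}
samples = op.compute_stats_batched(samples)  # char-level 10-gram repetition ratios
flags = list(op.process_batched(samples))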
class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with flagged-word ratio less than a specific max value.

__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – The language of flagged words to consider. If lang == "all", the list merged from all available languages is adopted.
• tokenization – Whether to use a model to tokenize documents.
• max_ratio – The max filter ratio in this op.
• flagged_words_dir – The directory storing the flagged_words file(s), whose names include "flagged_words" and which are in JSON format.
• use_words_aug – Whether to augment words, especially for Chinese and Vietnamese.
• words_aug_group_sizes – The group size of words to augment.
• words_aug_join_char – The join char between words to augment.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
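A sketch (the flagged-words directory default is environment-specific; the sample layout is an assumption):

from data_juicer.ops.filter import FlaggedWordFilter

op = FlaggedWordFilter(lang='en', tokenization=False, max_ratio=0.045)
sample = {'text': 'an ordinary, clean sentence', '__dj__stats__': {}}
sample = op.compute_stats_single(sample)  # flagged-word ratio into stats
keep = op.process_single(sample)          # False if the ratio exceeds 0.045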
class data_juicer.ops.filter.ImageAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with aesthetics scores within a specific range.

__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_scorer_model – Huggingface model name for the aesthetics predictor. By default, we will use 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'; refer to pypi.org/project/simple-aesthetics-predictor
• min_score – Min score for the predicted aesthetics in an image.
• max_score – Max score for the predicted aesthetics in an image.
• any_or_all – Keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – Extra positional arguments.
• kwargs – Extra keyword arguments.

compute_stats_single(sample, rank=None, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
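A sketch for the model-backed image filters; rank is the documented hook for placing the model on a specific device, and the path and rank value below are illustrative:

from data_juicer.ops.filter import ImageAestheticsFilter

op = ImageAestheticsFilter(min_score=0.5, max_score=1.0, any_or_all='any')
sample = {'text': 'a photo', 'images': ['/path/to/img.jpg'], '__dj__stats__': {}}
sample = op.compute_stats_single(sample, rank=0)  # scores each image
keep = op.process_single(sample)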
class data_juicer.ops.filter.ImageAspectRatioFilter(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with image aspect ratio within a specific range. AspectRatio = W / H.

__init__(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_ratio – The min aspect ratio to keep samples.
• max_ratio – The max aspect ratio to keep samples.
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageFaceCountFilter(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with the number of faces within a specific range.

__init__(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• cv_classifier – OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
• min_face_count – Minimum number of faces required for samples.
• max_face_count – Maximum number of faces required for samples.
• any_or_all – Keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – Extra positional arguments.
• kwargs – Extra keyword arguments.

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageFaceRatioFilter(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with face area ratios within a specific range.

__init__(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• cv_classifier – OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
• min_ratio – Min ratio for the largest face area in an image.
• max_ratio – Max ratio for the largest face area in an image.
• any_or_all – Keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – Extra positional arguments.
• kwargs – Extra keyword arguments.

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples whose images have low nsfw scores.

__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_nsfw_model – nsfw detection model name on huggingface.
• score_threshold – the nsfw score threshold for samples. It ranges from 0 to 1. Samples with nsfw scores less than this threshold will be kept.
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, rank=None, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample, rank=None)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
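A sketch; per the docstring, a lower nsfw score means the image passes (path and layout assumed):

from data_juicer.ops.filter import ImageNSFWFilter

op = ImageNSFWFilter(score_threshold=0.5, any_or_all='all')
sample = {'images': ['/path/to/img.jpg'], '__dj__stats__': {}}
sample = op.compute_stats_single(sample, rank=None)  # nsfw score per image
keep = op.process_single(sample)  # True when every image scores below 0.5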
class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep image pairs with similarities between images within a specific range.

__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_clip – clip model name on huggingface used to compute the similarity between the two images.
• min_score – The min similarity to keep samples.
• max_score – The max similarity to keep samples.
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, rank=None, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample, rank=None)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
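A sketch; it assumes each sample carries exactly the two images to compare under the default 'images' key:

from data_juicer.ops.filter import ImagePairSimilarityFilter

op = ImagePairSimilarityFilter(min_score=0.9, max_score=1.0)  # near-duplicates only
sample = {'images': ['/path/a.jpg', '/path/b.jpg'], '__dj__stats__': {}}
sample = op.compute_stats_single(sample, rank=None)  # CLIP similarity of the pair
keep = op.process_single(sample)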
class data_juicer.ops.filter.ImageShapeFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with image shape (w, h) within specific ranges.

__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_width – The min width to keep samples.
• max_width – The max width to keep samples.
• min_height – The min height to keep samples.
• max_height – The max height to keep samples.
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

Bases: Filter

Keep data samples whose image size (in Bytes/KB/MB/...) is within a specific range.

__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

Initialization method.

Parameters:
• min_size – The min image size to keep samples. Set to "0" by default, i.e., no size constraint.
• max_size – The max image size to keep samples. Set to "1TB" by default, an approximation of the unlimited case.
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageTextMatchingFilter(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples whose image-text matching scores are within a specific range.

__init__(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_blip – blip model name on huggingface to compute the matching score between image and text.
• min_score – The min matching score to keep samples.
• max_score – The max matching score to keep samples.
• horizontal_flip – Flip image horizontally (left to right).
• vertical_flip – Flip image vertically (top to bottom).
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• reduce_mode – reduce mode when one text corresponds to multiple images in a chunk. 'avg': Take the average of multiple values. 'max': Take the max of multiple values. 'min': Take the min of multiple values.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, rank=None, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample, rank=None)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
class data_juicer.ops.filter.ImageTextSimilarityFilter(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

Bases: Filter

Filter to keep samples whose image-text similarities are within a specific range.

__init__(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

Initialization method.

Parameters:
• hf_clip – clip model name on huggingface to compute the similarity between image and text.
• min_score – The min similarity to keep samples.
• max_score – The max similarity to keep samples.
• horizontal_flip – Flip image horizontally (left to right).
• vertical_flip – Flip image vertically (top to bottom).
• any_or_all – keep this sample with 'any' or 'all' strategy of all images. 'any': keep this sample if any images meet the condition. 'all': keep this sample only if all images meet the condition.
• reduce_mode – reduce mode when one text corresponds to multiple images in a chunk. 'avg': Take the average of multiple values. 'max': Take the max of multiple values. 'min': Take the min of multiple values.
• args – extra args
• kwargs – extra args

compute_stats_single(sample, rank=None, context=False)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample, rank=None)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
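A sketch of CLIP-based image-text filtering; the path is illustrative, and 'text'/'images' are the documented default keys:

from data_juicer.ops.filter import ImageTextSimilarityFilter

op = ImageTextSimilarityFilter(min_score=0.2, max_score=1.0, reduce_mode='avg')
sample = {'text': 'a dog on the beach', 'images': ['/path/to/img.jpg'],
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample, rank=None)
keep = op.process_single(sample)  # keeps samples whose similarity is in [0.2, 1.0]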
    class data_juicer.ops.filter.ImageWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]
    @@ -1267,37 +1216,31 @@
    -
class data_juicer.ops.filter.LanguageIDScoreFilter(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples in a specific language with a confidence score larger than a specific min value.

__init__(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

Initialization method.

Parameters:
• lang – The language(s) of samples to keep.
• min_score – The min language identification confidence score of samples to keep.
• args – extra args
• kwargs – extra args

compute_stats_single(sample)[source]

Compute stats for the sample which is used as a metric to decide whether to filter this sample.

Parameters:
• sample – input sample.
• context – whether to store context information of intermediate vars in the sample temporarily.

Returns:
sample with computed stats

process_single(sample)[source]

For sample level, sample --> Boolean.

Parameters:
sample – sample to decide whether to filter

Returns:
true for keeping and false for filtering
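A sketch; the language codes and sample layout are illustrative:

from data_juicer.ops.filter import LanguageIDScoreFilter

op = LanguageIDScoreFilter(lang=['en', 'fr'], min_score=0.8)
sample = {'text': 'This is clearly English text.', '__dj__stats__': {}}
sample = op.compute_stats_single(sample)  # language id and confidence into stats
keep = op.process_single(sample)          # True for en/fr with confidence >= 0.8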
class data_juicer.ops.filter.MaximumLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

Bases: Filter

Filter to keep samples with maximum line length within a specific range.

__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

Initialization method.

Parameters:
• min_len – The min filter length in this op; samples will be filtered if their maximum line length is below this parameter.
• max_len – The max filter length in this op; samples will be filtered if their maximum line length exceeds this parameter.
• args – extra args
• kwargs – extra args

compute_stats_batched(samples, context=False)[source]

process_batched(samples)[source]
    +
    +class data_juicer.ops.filter.PerplexityFilter(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +

    Bases: Filter

    +

    Filter to keep samples with perplexity score less than a specific max +value.

    +
    +
    +__init__(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • lang – Compute perplexity for samples in which language.

    • +
    • max_ppl – The max filter perplexity in this op, samples +will be filtered if their perplexity exceeds this parameter.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +compute_stats_batched(samples, context=False)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    +
    + +
    + +
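As a hedged sketch of the threshold semantics: lower perplexity means the text is more fluent under the language model, so only samples whose perplexity stays at or below max_ppl are kept. The sample layout mirrors the sketch above and remains an assumption; the underlying language model is fetched on first use.

from data_juicer.ops.filter import PerplexityFilter

op = PerplexityFilter(lang='en', max_ppl=1500)
samples = {
    'text': ['A fluent, well-formed English sentence.'],
    '__dj__stats__': [{}],
}
samples = op.compute_stats_batched(samples)  # writes a perplexity stat per sample
keep = list(op.process_batched(samples))     # True where perplexity <= max_ppl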
    +
    +class data_juicer.ops.filter.PhraseGroundingRecallFilter(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +

    Bases: Filter

    +

    Filter to keep samples whose locating recalls of phrases extracted +from text in the images are within a specified range.

    +
    +
    +__init__(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • hf_owlvit – Owl-ViT model name on huggingface to locate the +phrases extracted from the text.

    • +
    • min_recall – The min phrase grounding recall to keep samples.

    • +
    • max_recall – The max phrase grounding recall to keep samples.

    • +
    • horizontal_flip – Flip image horizontally (left to right).

    • +
    • vertical_flip – Flip image vertically (top to bottom).

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all audios. ‘any’: keep this sample if any audios meet the -condition. ‘all’: keep this sample only if all audios meet the +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the condition.

    • +
    • reduce_mode – reduce mode when one text corresponds to +multiple images in a chunk. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • +
    • iou_thr – the IoU threshold for NMS-like post-processing. If two +predicted bboxes overlap with an IoU larger than this +threshold, the bbox with lower confidence will be removed. Default: +0.5.

    • +
    • large_area_ratio_thr – the area ratio threshold for filtering out +those large predicted bboxes. If the area of a predicted bbox +accounts for more than this ratio threshold of the whole image +area, this bbox will be removed. Default: 0.95.

    • +
    • conf_thr – the confidence score threshold for removing +low-confidence bboxes. If the confidence score of a predicted bbox +is lower than the threshold, this bbox will be removed. Default: 0.

    • args – extra args

    • kwargs – extra args
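A sketch of the single-sample interface, assuming the usual data_juicer multimodal layout: text carrying an image token, an ‘images’ path list, and a pre-seeded stats dict. The token string, the image path, and the stats field name are assumptions here.

from data_juicer.ops.filter import PhraseGroundingRecallFilter

op = PhraseGroundingRecallFilter(hf_owlvit='google/owlvit-base-patch32',
                                 min_recall=0.1, max_recall=1.0)
sample = {
    'text': '<__dj__image> a dog chasing a red ball',  # assumed image token
    'images': ['/path/to/image.jpg'],                  # placeholder path
    '__dj__stats__': {},
}
sample = op.compute_stats_single(sample, rank=None)  # grounds phrases, stores recall
keep = op.process_single(sample)                     # True if recall falls in range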

    @@ -1359,8 +1393,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1378,8 +1412,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1394,30 +1428,24 @@
    -
    -class data_juicer.ops.filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.SpecialCharactersFilter(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with stopword ratio larger than a specific min -value.

    +

    Filter to keep samples with special-char ratio within a specific +range.

    -
    -__init__(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +__init__(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – Consider stopwords in what language. If lang == -“all”, we will adopt the one merged from all the available -languages

    • -
    • tokenization – whether to use model to tokenize documents

    • -
    • min_ratio – The min filter ratio in this op.

    • -
    • stopwords_dir – The directory storing the stopwords -file(s) whose name includes “stopwords” and in json format

    • -
    • use_words_aug – Whether to augment words, especially for -Chinese and Vietnamese

    • -
    • words_aug_group_sizes – The group size of words to augment

    • -
    • words_aug_join_char – The join char between words to -augment

    • +
    • min_ratio – The min filter ratio in this op, samples will +be filtered if their special-char ratio is below this +parameter.

    • +
    • max_ratio – The max filter ratio in this op, samples will +be filtered if their special-char ratio exceeds this +parameter.

    • args – extra args

    • kwargs – extra args

    @@ -1426,8 +1454,47 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_batched(samples)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    +
    + +
    + +
    +
    +class data_juicer.ops.filter.SpecifiedFieldFilter(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +

    Bases: Filter

    +

    Filter based on specified field information.

    +

    If the specified field information in the sample is not within the +specified target value, the sample will be filtered.

    +
    +
    +__init__(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • field_key – Filter based on the specified value +corresponding to the target key. A target key +corresponding to multi-level field information needs to be +separated by ‘.’.

    • +
    • target_value – The range of specified field information +corresponding to the samples that need to be retained.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
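A sketch of the multi-level lookup described above: ‘meta.source’ resolves to sample['meta']['source']. The field names and sample layout are illustrative.

from data_juicer.ops.filter import SpecifiedFieldFilter

# Keep only samples whose nested 'meta.source' value is in the target list.
op = SpecifiedFieldFilter(field_key='meta.source',
                          target_value=['wiki', 'arxiv'])
sample = {'text': '...', 'meta': {'source': 'wiki'}, '__dj__stats__': {}}
sample = op.compute_stats_single(sample)
keep = op.process_single(sample)  # True: 'wiki' is within target_value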
    +
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1445,8 +1512,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1461,64 +1528,29 @@
    -
    -class data_juicer.ops.filter.CharacterRepetitionFilter(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    -

    Bases: Filter

    -

    Filter to keep samples with char-level n-gram repetition ratio within a -specific range.

    -
    -
    -__init__(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • rep_len – Repetition length for char-level n-gram.

    • -
    • min_ratio – The min filter ratio in this op, samples will -be filtered if their char-level n-gram repetition ratio is -below this parameter.

    • -
    • max_ratio – The max filter ratio in this op, samples will -be filtered if their char-level n-gram repetition ratio -exceeds this parameter.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -compute_stats_batched(samples)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    - -
    - -
    -
    -class data_juicer.ops.filter.ImageShapeFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with image shape (w, h) within specific ranges.

    +

    Filter based on specified numeric field information.

    +

    If the specified numeric information in the sample is not within the +specified range, the sample will be filtered.

    -
    -__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_width – The min width to keep samples.

    • -
    • max_width – The max width to keep samples.

    • -
    • min_height – The min height to keep samples.

    • -
    • max_height – The max height to keep samples.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    • +
    • field_key – Filter based on the specified numeric value +corresponding to the target key. A target key +corresponding to multi-level field information needs to be +separated by ‘.’.

    • +
    • min_value – The min filter value in SpecifiedNumericField +op, samples will be filtered if their specified numeric +field value is below this parameter.

    • +
    • max_value – The max filter value in SpecifiedNumericField +op, samples will be filtered if their specified numeric +field value exceeds this parameter.

    • args – extra args

    • kwargs – extra args
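Analogously, a hedged sketch for the numeric variant; the ‘meta.score’ key and the values are illustrative.

from data_juicer.ops.filter import SpecifiedNumericFieldFilter

# Keep samples whose sample['meta']['score'] lies in [0.5, 1.0].
op = SpecifiedNumericFieldFilter(field_key='meta.score',
                                 min_value=0.5, max_value=1.0)
sample = {'text': '...', 'meta': {'score': 0.8}, '__dj__stats__': {}}
sample = op.compute_stats_single(sample)
keep = op.process_single(sample)  # True: 0.5 <= 0.8 <= 1.0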

    @@ -1527,8 +1559,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1546,8 +1578,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1562,25 +1594,30 @@
    -
    -class data_juicer.ops.filter.VideoDurationFilter(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose videos’ durations are within a specified range.

    +

    Filter to keep samples with stopword ratio larger than a specific min +value.

    -
    -__init__(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_duration – The min video duration to keep samples in seconds. -It’s 0 by default.

    • -
    • max_duration – The max video duration to keep samples in seconds. -It’s sys.maxsize by default.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • +
    • lang – Consider stopwords in what language. If lang == +“all”, we will adopt the one merged from all the available +languages

    • +
    • tokenization – whether to use model to tokenize documents

    • +
    • min_ratio – The min filter ratio in this op.

    • +
    • stopwords_dir – The directory storing the stopwords +file(s) whose name includes “stopwords” and in json format

    • +
    • use_words_aug – Whether to augment words, especially for +Chinese and Vietnamese

    • +
    • words_aug_group_sizes – The group size of words to augment

    • +
    • words_aug_join_char – The join char between words to +augment

    • args – extra args

    • kwargs – extra args
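A sketch of the ratio check, assuming the stopword lists have already been fetched into stopwords_dir by data_juicer’s asset preparation; stopword-heavy text like the example below should clear the default min_ratio.

from data_juicer.ops.filter import StopWordsFilter

op = StopWordsFilter(lang='en', min_ratio=0.3)
sample = {
    'text': 'It is what it is, and that is all there is to it.',
    '__dj__stats__': {},  # stats slot, normally added by the pipeline
}
sample = op.compute_stats_single(sample)  # stores the stopword ratio
keep = op.process_single(sample)          # True when ratio >= min_ratio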

    @@ -1589,8 +1626,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1608,8 +1645,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1624,30 +1661,29 @@
    -
    -class data_juicer.ops.filter.TextActionFilter(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.SuffixFilter(suffixes: str | List[str] = [], *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep texts those contain actions in the text.

    +

    Filter to keep samples with specified suffix.

    -
    -__init__(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]
    +
    +__init__(suffixes: str | List[str] = [], *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – language of the text in the samples. ‘en’ for detection of -actions in English and ‘zh’ for detection of actions in Chinese.

    • -
    • mini_action_num – The min action number in the filtering. samples -will be filtered if their action number in the text is below this -parameter.

    • +
    • suffixes – the suffix of text that will be kept. +For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]

    • +
    • args – extra args

    • +
    • kwargs – extra args
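A sketch of the suffix check. It assumes the per-sample suffix lives in a ‘__dj__suffix__’ column that data_juicer’s loaders attach; that field name is an assumption here.

from data_juicer.ops.filter import SuffixFilter

op = SuffixFilter(suffixes=['.txt', '.pdf'])
sample = {'text': '...', '__dj__suffix__': '.txt', '__dj__stats__': {}}
sample = op.compute_stats_single(sample)
keep = op.process_single(sample)  # True: '.txt' is among the allowed suffixes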

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1665,8 +1701,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1681,49 +1717,30 @@
    -
    -class data_juicer.ops.filter.VideoOcrAreaRatioFilter(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.TextActionFilter(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose detected text area ratios for specified frames -in the video are within a specified range.

    +

    Filter to keep texts that contain actions.

    -
    -__init__(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • min_area_ratio – The min ocr area ratio to keep samples. It’s 0 -by default.

    • -
    • max_area_ratio – The max ocr area ratio to keep samples. It’s 1.0 -by default.

    • -
    • frame_sample_num – The number of sampled frames to calculate the -ocr area ratio. If it’s 1, only middle frame will be selected. If -it’s 2, only the first and the last frames will be selected. If -it’s larger than 2, in addition to the first and the last frames, -other frames will be sampled evenly within the video duration.

    • -
    • languages_to_detect – texts in which languages should be -detected. Default: [‘ch_sim’, ‘en’]. Full language list can be -found here: https://www.jaided.ai/easyocr/.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
      +
      Parameters:
      +
        +
      • lang – language of the text in the samples. ‘en’ for detection of +actions in English and ‘zh’ for detection of actions in Chinese.

      • +
      • min_action_num – The min action number in the filtering. Samples +will be filtered if their action number in the text is below this +parameter.

    -
    -get_reader(rank)[source]
    -
    - -
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1741,8 +1758,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1757,52 +1774,34 @@
    -
    -class data_juicer.ops.filter.VideoNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.TextEntityDependencyFilter(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples whose videos have low nsfw scores.

    +

    Identify the entities in the text that are independent of any other token, +and filter them out. Texts containing no entities will be omitted.

    -
    -__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_nsfw_model – nsfw detection model name on huggingface.

    • -
    • score_threshold – the nsfw score threshold for samples. -range from 0 to 1. Samples with nsfw score less than this threshold -will be kept.

    • -
    • frame_sampling_method – sampling method of extracting frame -images from the videos. -Should be one of [“all_keyframes”, “uniform”]. -The former one extracts all key frames (the number of which depends -on the duration of the video) and the latter one extract specified -number of frames uniformly from the video. -Default: “all_keyframes”.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

    • -
    • reduce_mode – reduce mode for multiple sampled video frames. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • lang – language of the text in the samples. ‘en’ for detection of +entities in English and ‘zh’ for detection of entities in Chinese.

    • +
    • min_dependency_num – The min dependency edge number in the filtering. +An entity is considered independent if its number of edges in the dependency +tree is below this parameter.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy. +‘any’: keep this sample if any entity is dependent. ‘all’: keep this +sample only if all entities are dependent.
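A hedged sketch: the dependency parsing happens inside the op (its models are fetched on demand), so the caller only supplies text in the configured language.

from data_juicer.ops.filter import TextEntityDependencyFilter

op = TextEntityDependencyFilter(lang='en', min_dependency_num=1,
                                any_or_all='all')
sample = {'text': 'The quick brown fox jumps over the lazy dog.',
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample)  # counts dependency edges per entity
keep = op.process_single(sample)          # False when no entity is detected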

    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1820,8 +1819,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1836,23 +1835,23 @@
    -
    -class data_juicer.ops.filter.SpecialCharactersFilter(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.TextLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with special-char ratio within a specific +

    Filter to keep samples with total text length within a specific range.

    -
    -__init__(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]
    +
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_ratio – The min filter ratio in this op, samples will -be filtered if their special-char ratio is below this +

    • min_len – The min text length in the filtering. samples +will be filtered if their text length is below this parameter.

    • -
    • max_ratio – The max filter ratio in this op, samples will -be filtered if their special-char ratio exceeds this +

    • max_len – The max text length in the filtering. samples +will be filtered if their text length exceeds this parameter.

    • args – extra args

    • kwargs – extra args

    • @@ -1862,60 +1861,37 @@
    -
    -compute_stats_batched(samples)[source]
    +
    +compute_stats_batched(samples)[source]
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.filter.VideoFramesTextSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples those similarities between sampled video frame -images and text within a specific range.

    +

    Filter to keep samples with total token number within a specific +range.

    -
    -__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +__init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_clip – clip model name on huggingface to compute -the similarity between frame image and text. It’s kind of -language-related. For example, for Chinese datasets, ChineseCLIP -might be a better choice.

    • -
    • min_score – the min similarity to keep samples.

    • -
    • max_score – the max similarity to keep samples.

    • -
    • frame_sampling_method – sampling method of extracting frame -images from the videos. -Should be one of [“all_keyframes”, “uniform”]. -The former one extracts all key frames (the number of which depends -on the duration of the video) and the latter one extract specified -number of frames uniformly from the video. -Default: “all_keyframes”.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

    • -
    • horizontal_flip – flip frame image horizontally (left to right).

    • -
    • vertical_flip – flip frame image vertically (top to bottom).

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • -
    • reduce_mode – reduce mode when one text corresponds to -multiple video frame images in a chunk. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • +
    • hf_tokenizer – the tokenizer name of Hugging Face tokenizers.

    • +
    • min_num – The min filter token number in this op, samples +will be filtered if their token number is below this +parameter.

    • +
    • max_num – The max filter token number in this op, samples +will be filtered if their token number exceeds this +parameter.

    • args – extra args

    • kwargs – extra args
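A sketch using the documented default tokenizer; the first call downloads it from the Hugging Face hub, and max_num here is illustrative.

from data_juicer.ops.filter import TokenNumFilter

op = TokenNumFilter(hf_tokenizer='EleutherAI/pythia-6.9b-deduped',
                    min_num=10, max_num=2048)
sample = {'text': 'A sentence long enough to pass the ten-token minimum with ease.',
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample)  # stores the token count
keep = op.process_single(sample)          # True when 10 <= count <= 2048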

    @@ -1924,8 +1900,8 @@
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1943,8 +1919,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1959,34 +1935,57 @@
    -
    -class data_juicer.ops.filter.ImageAspectRatioFilter(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with image aspect ratio within a specific range. -AspectRatio = W / H.

    +

    Filter to keep data samples with aesthetics scores for specified frames +in the videos within a specific range.

    -
    -__init__(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_ratio – The min aspect ratio to keep samples.

    • -
    • max_ratio – The max aspect ratio to keep samples.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +

    • hf_scorer_model – Huggingface model name for the aesthetics +predictor. By default, we will use +‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’, +refer to pypi.org/project/simple-aesthetics-predictor

    • +
    • min_score – Min score for the predicted aesthetics in a video.

    • +
    • max_score – Max score for the predicted aesthetics in a video.

    • +
    • frame_sampling_method – sampling method of extracting frame +images from the videos. +Should be one of [“all_keyframes”, “uniform”]. +The former one extracts all key frames and the latter one extracts a +specified number of frames uniformly from the video. +Default: “uniform” with frame_num=3, considering that the number of +keyframes can be large while their difference is usually small +in terms of their aesthetics.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video. Only works when frame_sampling_method is “uniform”. If +it’s 1, only the middle frame will be extracted. If it’s 2, only +the first and the last frames will be extracted. If it’s larger +than 2, in addition to the first and the last frames, other frames +will be extracted uniformly within the video duration.

    • +
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the condition.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • reduce_mode – reduce mode when one sample corresponds to +multiple frames, must be one of [‘avg’,’max’, ‘min’]. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • +
    • args – Extra positional arguments.

    • +
    • kwargs – Extra keyword arguments.

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2004,8 +2003,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2020,24 +2019,25 @@
    -
    -class data_juicer.ops.filter.AudioDurationFilter(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoAspectRatioFilter(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose audios’ durations are within a specified range.

    +

    Filter to keep samples with video aspect ratio within a specific range. +AspectRatio = W / H.

    -
    -__init__(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_duration – The min audio duration to keep samples in seconds. -It’s 0 by default.

    • -
    • max_duration – The max audio duration to keep samples in seconds. -It’s sys.maxsize by default.

    • +
    • min_ratio – The minimum aspect ratio to keep samples, +supported format is a string, such as “9:21” or “9/21”.

    • +
    • max_ratio – The maximum aspect ratio to keep samples, +supported format is a string, such as “21:9” or “21/9”.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all audios. ‘any’: keep this sample if any audios meet the -condition. ‘all’: keep this sample only if all audios meet the +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the condition.

    • args – extra args

    • kwargs – extra args
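A sketch of the ratio strings accepted above; the video token and path are placeholders, and aspect ratios are compared as W/H fractions.

from data_juicer.ops.filter import VideoAspectRatioFilter

# Accept anything between portrait 9/21 and landscape 21/9.
op = VideoAspectRatioFilter(min_ratio='9/21', max_ratio='21/9',
                            any_or_all='any')
sample = {'text': '<__dj__video>', 'videos': ['/path/to/clip.mp4'],
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample, context=False)
keep = op.process_single(sample)  # True if any video's W/H lies in range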

    • @@ -2047,8 +2047,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2066,8 +2066,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2082,21 +2082,25 @@
    -
    -class data_juicer.ops.filter.LanguageIDScoreFilter(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoDurationFilter(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples in a specific language with confidence score -larger than a specific min value.

    +

    Keep data samples whose videos’ durations are within a specified range.

    -
    -__init__(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]
    +
    +__init__(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – Samples in which languages to keep.

    • -
    • min_score – The min language identification confidence -scores of samples to keep.

    • +
    • min_duration – The min video duration to keep samples in seconds. +It’s 0 by default.

    • +
    • max_duration – The max video duration to keep samples in seconds. +It’s sys.maxsize by default.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the +condition.

    • args – extra args

    • kwargs – extra args

    @@ -2105,8 +2109,8 @@
    -
    -compute_stats_single(sample)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2124,8 +2128,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2140,19 +2144,48 @@
    -
    -class data_juicer.ops.filter.SuffixFilter(suffixes: str | List[str] = [], *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoFramesTextSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with specified suffix.

    +

    Filter to keep samples whose similarities between sampled video frame +images and text are within a specific range.

    -
    -__init__(suffixes: str | List[str] = [], *args, **kwargs)[source]
    +
    +__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • suffixes – the suffix of text that will be keep. -For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]

    • +
    • hf_clip – clip model name on huggingface to compute +the similarity between frame image and text. It’s kind of +language-related. For example, for Chinese datasets, ChineseCLIP +might be a better choice.

    • +
    • min_score – the min similarity to keep samples.

    • +
    • max_score – the max similarity to keep samples.

    • +
    • frame_sampling_method – sampling method of extracting frame +images from the videos. +Should be one of [“all_keyframes”, “uniform”]. +The former one extracts all key frames (the number of which depends +on the duration of the video) and the latter one extracts a specified +number of frames uniformly from the video. +Default: “all_keyframes”.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video. Only works when frame_sampling_method is “uniform”. If +it’s 1, only the middle frame will be extracted. If it’s 2, only +the first and the last frames will be extracted. If it’s larger +than 2, in addition to the first and the last frames, other frames +will be extracted uniformly within the video duration.

    • +
    • horizontal_flip – flip frame image horizontally (left to right).

    • +
    • vertical_flip – flip frame image vertically (top to bottom).

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the +condition.

    • +
    • reduce_mode – reduce mode when one text corresponds to +multiple video frame images in a chunk. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • args – extra args

    • kwargs – extra args

    @@ -2161,8 +2194,8 @@
    -
    -compute_stats_single(sample)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2180,8 +2213,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2196,25 +2229,37 @@
    -
    -class data_juicer.ops.filter.ImageSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose image size (in Bytes/KB/MB/…) within a -specific range.

    +

    Filter to keep samples with video motion scores within a specific range. The +Farneback algorithm from OpenCV is used to compute dense optical flow.

    -
    -__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_size – The min image size to keep samples. set to be “0” by -default for no size constraint

    • -
    • max_size – The max image size to keep samples. set to be -“1TB” by default, an approximate for un-limited case

    • +
    • min_score – The minimum motion score to keep samples.

    • +
    • max_score – The maximum motion score to keep samples.

    • +
    • sampling_fps – The sampling rate in frames_per_second for +optical flow calculations.

    • +
    • size – Resize frames before computing optical flow. If size is a +sequence like (h, w), frame size will be matched to this. If size +is an int, the smaller edge of frames will be matched to this number, +i.e., if height > width, the frame will be rescaled to (size * +height / width, size). Default None keeps the original size.

    • +
    • max_size – The maximum allowed size for the longer edge of resized +frames. If the longer edge of frames is greater than max_size after +being resized according to size, size will be overruled so that the +longer edge is equal to max_size. As a result, the smaller edge may +be shorter than size. This is only supported if size is an int.

    • +
    • relative – If True, the optical flow magnitude is normalized to +a [0, 1] range, relative to the frame’s diagonal length.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the condition.

    • args – extra args

    • kwargs – extra args
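A sketch combining the resize knobs above. With relative=True the magnitudes are normalized by the frame diagonal, so the threshold becomes resolution-independent; every number below is illustrative.

from data_juicer.ops.filter import VideoMotionScoreFilter

op = VideoMotionScoreFilter(
    min_score=0.005,  # illustrative; relative scores land in [0, 1]
    sampling_fps=2,   # two frames per second feed the optical-flow estimate
    size=256,         # shorter edge resized to 256 before computing flow
    max_size=512,     # cap on the longer edge after resizing
    relative=True,    # normalize magnitudes by the frame's diagonal length
)
sample = {'text': '<__dj__video>', 'videos': ['/path/to/clip.mp4'],
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample)
keep = op.process_single(sample)  # True when the motion score is in range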

    • @@ -2224,8 +2269,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2243,8 +2288,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2259,23 +2304,21 @@
    -
    -class data_juicer.ops.filter.VideoWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples whose videos have no watermark with high -probability.

    +

    Filter to keep samples whose videos have low nsfw scores.

    -
    -__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_watermark_model – watermark detection model name on -huggingface.

    • -
    • prob_threshold – the predicted watermark probability threshold -for samples. range from 0 to 1. Samples with watermark probability -less than this threshold will be kept.

    • +
    • hf_nsfw_model – nsfw detection model name on huggingface.

    • +
    • score_threshold – the nsfw score threshold for samples, +ranging from 0 to 1. Samples with nsfw score less than this threshold +will be kept.

    • frame_sampling_method – sampling method of extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. @@ -2305,8 +2348,8 @@

    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2324,8 +2367,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2340,26 +2383,34 @@
    -
    -class data_juicer.ops.filter.WordsNumFilter(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoOcrAreaRatioFilter(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with total words number within a specific -range.

    +

    Keep data samples whose detected text area ratios for specified frames +in the video are within a specified range.

    -
    -__init__(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – sample in which language.

    • -
    • tokenization – whether to use model to tokenize documents

    • -
    • min_num – The min filter word number in this op, samples -will be filtered if their word number is below this -parameter.

    • -
    • max_num – The max filter word number in this op, samples -will be filtered if their word number exceeds this -parameter.

    • +
    • min_area_ratio – The min ocr area ratio to keep samples. It’s 0 +by default.

    • +
    • max_area_ratio – The max ocr area ratio to keep samples. It’s 1.0 +by default.

    • +
    • frame_sample_num – The number of sampled frames to calculate the +ocr area ratio. If it’s 1, only the middle frame will be selected. If +it’s 2, only the first and the last frames will be selected. If +it’s larger than 2, in addition to the first and the last frames, +other frames will be sampled evenly within the video duration.

    • +
    • languages_to_detect – texts in which languages should be +detected. Default: [‘ch_sim’, ‘en’]. Full language list can be +found here: https://www.jaided.ai/easyocr/.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the +condition.

    • args – extra args

    • kwargs – extra args

    @@ -2368,47 +2419,75 @@
    -
    -compute_stats_batched(samples, context=False)[source]
    +
    +get_reader(rank)[source]
    -
    -process_batched(samples)[source]
    -
    +
    +compute_stats_single(sample, rank=None, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    +
    +
    Parameters:
    +
      +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    • +
    +
    +
    Returns:
    +

    sample with computed stats

    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> Boolean.

    +
    +
    Parameters:
    +

    sample – sample to decide whether to filter

    +
    +
    Returns:
    +

    true for keeping and false for filtering

    +
    +
    +
    -
    -class data_juicer.ops.filter.ImageFaceCountFilter(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoResolutionFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with the number of faces within a specific range.

    +

    Keep data samples whose videos’ resolutions are within a specified range.

    -
    -__init__(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • cv_classifier – OpenCV classifier path for face detection. -By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • -
    • min_face_count – Minimum number of faces required for samples.

    • -
    • max_face_count – Maximum number of faces required for samples.

    • -
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +

    • min_width – The min horizontal resolution.

    • +
    • max_width – The max horizontal resolution.

    • +
    • min_height – The min vertical resolution.

    • +
    • max_height – The max vertical resolution.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the condition.

    • -
    • args – Extra positional arguments.

    • -
    • kwargs – Extra keyword arguments.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2426,8 +2505,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2442,35 +2521,51 @@
    -
    -class data_juicer.ops.filter.ImageFaceRatioFilter(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoTaggingFromFramesFilter(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with face area ratios within a specific range.

    +

    Filter to keep samples whose videos contain the given tags.

    -
    -__init__(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • cv_classifier – OpenCV classifier path for face detection. -By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • -
    • min_ratio – Min ratio for the largest face area in an image.

    • -
    • max_ratio – Max ratio for the largest face area in an image.

    • -
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +

    • tags – a tag list to sift the videos; the full tag set can be found +in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt # noqa: E501

    • +
    • contain – require the videos to contain ‘any’ or ‘all’ of the tags. +When tags is [], ‘all’ keeps all samples while ‘any’ keeps no +sample.

    • +
    • frame_sampling_method – sampling method of extracting frame +images from the videos. Should be one of +[“all_keyframes”, “uniform”]. +The former one extracts all key frames (the number of which depends +on the duration of the video) and the latter one extracts a specified +number of frames uniformly from the video. +Default: “all_keyframes”.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video. Only works when frame_sampling_method is “uniform”. If +it’s 1, only the middle frame will be extracted. If it’s 2, only +the first and the last frames will be extracted. If it’s larger +than 2, in addition to the first and the last frames, other frames +will be extracted uniformly within the video duration.

    • +
    • tag_field_name – the field name to store the tags. It’s +“__dj__video_frame_tags__” by default.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the condition.

    • -
    • args – Extra positional arguments.

    • -
    • kwargs – Extra keyword arguments.

    • +
    • args – extra args

    • +
    • kwargs – extra args
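A sketch of the tag semantics: with contain='any' a video passes as soon as one listed tag is detected in its sampled frames. The tagging model loads inside the op; the token and path are placeholders.

from data_juicer.ops.filter import VideoTaggingFromFramesFilter

op = VideoTaggingFromFramesFilter(tags=['people', 'dog'], contain='any',
                                  frame_sampling_method='uniform',
                                  frame_num=3)
sample = {'text': '<__dj__video>', 'videos': ['/path/to/clip.mp4'],
          '__dj__stats__': {}}
sample = op.compute_stats_single(sample, rank=None)  # tags the sampled frames
keep = op.process_single(sample, rank=None)          # True if 'people' or 'dog' appears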

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2488,8 +2583,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2504,31 +2599,44 @@
    -
    -class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with flagged-word ratio less than a specific max -value.

    +

    Filter to keep samples whose videos have no watermark with high +probability.

    -
    -__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – Consider flagged words in what language. If lang == -“all”, we will adopt the one merged from all the available -languages

    • -
    • tokenization – Whether to use model to tokenize documents

    • -
    • max_ratio – The max filter ratio in this op.

    • -
    • flagged_words_dir – The directory storing the -flagged_words file(s) whose name includes “flagged_words” -and in json format

    • -
    • use_words_aug – Whether to augment words, especially for -Chinese and Vietnamese

    • -
    • words_aug_group_sizes – The group size of words to augment

    • -
    • words_aug_join_char – The join char between words to -augment

    • +
    • hf_watermark_model – watermark detection model name on +huggingface.

    • +
    • prob_threshold – the predicted watermark probability threshold +for samples, ranging from 0 to 1. Samples with watermark probability +less than this threshold will be kept.

    • +
    • frame_sampling_method – sampling method of extracting frame +images from the videos. +Should be one of [“all_keyframes”, “uniform”]. +The former one extracts all key frames (the number of which depends +on the duration of the video) and the latter one extracts a specified +number of frames uniformly from the video. +Default: “all_keyframes”.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video. Only works when frame_sampling_method is “uniform”. If +it’s 1, only the middle frame will be extracted. If it’s 2, only +the first and the last frames will be extracted. If it’s larger +than 2, in addition to the first and the last frames, other frames +will be extracted uniformly within the video duration.

    • +
    • reduce_mode – reduce mode for multiple sampled video frames. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the +condition.

    • args – extra args

    • kwargs – extra args
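For orientation, a hedged usage sketch (it assumes data_juicer is installed and the Hugging Face model can be downloaded; only the parameters documented above are used):

```
from data_juicer.ops.filter import VideoWatermarkFilter

op = VideoWatermarkFilter(
    hf_watermark_model='amrul-hzz/watermark_detector',
    prob_threshold=0.8,               # samples with watermark prob < 0.8 are kept
    frame_sampling_method='uniform',  # or 'all_keyframes'
    frame_num=3,                      # first, last and one middle frame
    reduce_mode='avg',                # average the per-frame probabilities
    any_or_all='any',                 # keep the sample if any video passes
)
```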

    @@ -2537,8 +2645,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2556,8 +2664,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2613,38 +2721,26 @@
    -
    -class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.WordsNumFilter(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

Filter to keep samples with video motion scores within a specific range. Farneback’s algorithm from OpenCV is used to compute dense optical flow.

    +

    Filter to keep samples with total words number within a specific +range.

    -
    -__init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_score – The minimum motion score to keep samples.

    • -
    • max_score – The maximum motion score to keep samples.

    • -
    • sampling_fps – The sampling rate in frames_per_second for -optical flow calculations.

    • -
    • size – Resize frames before computing optical flow. If size is a -sequence like (h, w), frame size will be matched to this. If size -is an int, smaller edge of frames will be matched to this number. -i.e, if height > width, then frame will be rescaled to (size * -height / width, size). Default None to keep the original size.

    • -
    • max_size – The maximum allowed for the longer edge of resized -frames. If the longer edge of frames is greater than max_size after -being resized according to size, size will be overruled so that the -longer edge is equal to max_size. As a result, the smaller edge may -be shorter than size. This is only supported if size is an int.

    • -
    • relative – If True, the optical flow magnitude is normalized to -a [0, 1] range, relative to the frame’s diagonal length.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • +
• lang – the language of the samples.

    • +
    • tokenization – whether to use model to tokenize documents

    • +
    • min_num – The min filter word number in this op, samples +will be filtered if their word number is below this +parameter.

    • +
    • max_num – The max filter word number in this op, samples +will be filtered if their word number exceeds this +parameter.

    • args – extra args

    • kwargs – extra args

    @@ -2653,110 +2749,14 @@
    -
    -compute_stats_single(sample, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    - -
    -
    -process_single(sample)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    -

    Bases: Filter

    -

    Filter to keep image pairs with similarities between images -within a specific range.

    -
    -
    -__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    -
    param hf_clip:
    -

    clip model name on huggingface to compute -the similarity between image and text.

    -
    -
    param min_score:
    -

    The min similarity to keep samples.

    -
    -
    param max_score:
    -

    The max similarity to keep samples.

    -
    -
    param any_or_all:
    -

    keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    -
    -
    param args:
    -

    extra args

    -
    -
    param kwargs:
    -

    extra args

    -
    -
    -
    -
    - -
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    +
    +compute_stats_batched(samples, context=False)[source]
    +
    -
    -process_single(sample, rank=None)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    +
    +process_batched(samples)[source]
    +
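A hedged sketch of the batched flow through WordsNumFilter shown above (the dict-of-lists batch layout and the ‘__dj__stats__’ stats key are assumptions about data_juicer’s sample format):

```
from data_juicer.ops.filter import WordsNumFilter

op = WordsNumFilter(lang='en', tokenization=False, min_num=10, max_num=10000)

samples = {
    'text': ['too short',
             'a sentence that is certainly long enough to pass this filter'],
    '__dj__stats__': [{}, {}],   # per-sample stats container (assumed field name)
}
samples = op.compute_stats_batched(samples)   # fills in per-sample word counts
keep = list(op.process_batched(samples))      # expect [False, True]
```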
    diff --git a/data_juicer.ops.mapper.html b/data_juicer.ops.mapper.html index baf51b62e..16326ffcb 100644 --- a/data_juicer.ops.mapper.html +++ b/data_juicer.ops.mapper.html @@ -46,53 +46,55 @@
@@ -130,53 +132,22 @@

    data_juicer.ops.mapper

    -
    -class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
    -

    Bases: Mapper

    -

Mapper to caption a video according to its audio streams based on the Qwen-Audio model.

    -
    -
    -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only captioned sample in the -final datasets and the original sample will be removed. It’s True -in default.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -process_batched(samples, rank=None)[source]
    -
    - -
    - -
    -
    -class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

Mapper to generate video tags from audio streams extracted from videos using the Audio Spectrogram Transformer.

    +

    Simple wrapper for FFmpeg audio filters.

    -
    -__init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_ast – path to the HF model to tag from audios.

    • -
    • trust_remote_code – whether to trust the remote code of HF models

    • -
    • tag_field_name – the field name to store the tags. It’s -“__dj__video_audio_tags__” in default.

    • +
    • filter_name – ffmpeg audio filter name.

    • +
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • +
    • global_args – list-arguments passed to ffmpeg command-line.

    • +
    • capture_stderr – whether to capture stderr.

    • +
    • overwrite_output – whether to overwrite output file.

    • args – extra args

    • kwargs – extra args
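For example, a hedged sketch applying FFmpeg’s standard ‘atempo’ audio filter through this wrapper (the sample layout under the ‘audios’ key is an assumption, and the path is a placeholder):

```
from data_juicer.ops.mapper import AudioFFmpegWrappedMapper

# Speed the audio up by 25% via ffmpeg's 'atempo' filter.
op = AudioFFmpegWrappedMapper(
    filter_name='atempo',
    filter_kwargs={'tempo': 1.25},
    overwrite_output=True,
)
sample = {'audios': ['path/to/input.wav']}
result = op.process_single(sample)  # sample with the filtered audio
```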

    @@ -185,8 +156,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -201,44 +172,42 @@
    -
    -class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]

    Bases: Mapper

    -

Mapper to generate samples whose texts are generated based on gpt-4-vision and the image.

    +

    Mapper to convert Chinese between Traditional Chinese, Simplified Chinese +and Japanese Kanji.

    -
    -__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(mode: str = 's2t', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
• mode – mode of text generated from images, can be one of [‘reasoning’, ‘description’, ‘conversation’, ‘custom’]

    • -
    • api_key – the API key to authenticate the request.

    • -
    • max_token – the maximum number of tokens to generate. -Default is 500.

    • -
• temperature – controls the randomness of the output (range from 0 to 1). Default is 1.0.

    • -
    • system_prompt – a string prompt used to set the context of a -conversation and provide global guidance or rules for the -gpt4-vision so that it can generate responses in the expected way. -If mode set to custom, the parameter will be used.

    • -
    • user_prompt – a string prompt to guide the generation of -gpt4-vision for each samples. It’s “” in default, which means no -prompt provided.

    • -
• user_prompt_key – the key name of fields in samples to store prompts for each sample. It’s used to set different prompts for different samples. If it’s none, use the prompt in parameter “user_prompt”. It’s None in default.

    • -
    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated text in the -final datasets and the original text will be removed. It’s True -in default.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    • +
    • mode

      Choose the mode to convert Chinese:

      +

      s2t: Simplified Chinese to Traditional Chinese,

      +

      t2s: Traditional Chinese to Simplified Chinese,

      +

      s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),

      +

      tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,

      +

      s2hk: Simplified Chinese to Traditional Chinese +(Hong Kong variant),

      +

      hk2s: Traditional Chinese (Hong Kong variant) to Simplified +Chinese,

      +

      s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) +with Taiwanese idiom,

      +

      tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese +with Mainland Chinese idiom,

      +

      t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),

      +

      tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,

      +

      hk2t: Traditional Chinese (Hong Kong variant) to Traditional +Chinese,

      +

      t2hk: Traditional Chinese to Traditional Chinese +(Hong Kong variant),

      +

      t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese +Kanji,

      +

      jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese +Characters,

      +

    • args – extra args

    • kwargs – extra args
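A hedged sketch of the s2t mode (the dict-of-lists batch layout is an assumption):

```
from data_juicer.ops.mapper import ChineseConvertMapper

op = ChineseConvertMapper(mode='s2t')  # Simplified -> Traditional
samples = {'text': ['汉字转换']}
out = op.process_batched(samples)      # e.g. {'text': ['漢字轉換']}
```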

    @@ -247,21 +216,21 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to normalize unicode punctuations to English punctuations in text +

    Mapper to clean copyright comments at the beginning of the text samples.

    -
    -__init__(*args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -274,25 +243,26 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove bibliography at the end of documents in Latex -samples.

    +

    Mapper to clean email in text samples.

    -
    -__init__(*args, **kwargs)[source]
    +
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      +
    • pattern – regular expression pattern to search for within text.

    • +
    • repl – replacement string, default is empty string.

    • args – extra args

    • kwargs – extra args
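A hedged sketch masking instead of deleting emails (the regex below is illustrative, not necessarily the op’s built-in default pattern):

```
from data_juicer.ops.mapper import CleanEmailMapper

op = CleanEmailMapper(
    pattern=r'[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}',  # illustrative
    repl='[EMAIL]',
)
samples = {'text': ['contact me at foo@example.com please']}
out = op.process_batched(samples)  # -> 'contact me at [EMAIL] please'
```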

    @@ -301,25 +271,24 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to split text samples to sentences.

    +

    Mapper to clean html code in text samples.

    -
    -__init__(lang: str = 'en', *args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – split sentence of text in which language.

    • args – extra args

    • kwargs – extra args

    @@ -328,71 +297,26 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to cut videos into scene clips.

    -
    -
    -avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    -
    - +

Mapper to clean ipv4 and ipv6 addresses in text samples.

    -
    -__init__(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
• detector – Algorithm from scenedetect.detectors. Should be one of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector’].

    • -
    • threshold – Threshold passed to the detector.

    • -
    • min_scene_len – Minimum length of any scene.

    • -
    • show_progress – Whether to show progress from scenedetect.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -process_single(sample, context=False)[source]
    -

    For sample level, sample –> sample

    -
    -
    Parameters:
    -

    sample – sample to process

    -
    -
    Returns:
    -

    processed sample

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    -

    Bases: Mapper

    -

    Mapper to clean ipv4 and ipv6 address in text samples.

    -
    -
    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • pattern – regular expression pattern to search for within text.

    • -
    • repl – replacement string, default is empty string.

    • +
    • pattern – regular expression pattern to search for within text.

    • +
    • repl – replacement string, default is empty string.

    • args – extra args

    • kwargs – extra args

    @@ -436,20 +360,18 @@
    -
    -class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove headers at the beginning of documents in Latex +

    Mapper to expand macro definitions in the document body of Latex samples.

    -
    -__init__(drop_no_head: bool = True, *args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • drop_no_head – whether to drop sample texts without -headers.

    • args – extra args

    • kwargs – extra args

    @@ -458,28 +380,27 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove table texts from text samples.

    -

A regular expression is used to remove tables whose column numbers are within the specified range.

    +

    Mapper to fix unicode errors in text samples.

    -
    -__init__(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +
    +__init__(normalization: str | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_col – The min number of columns of table to remove.

    • -
    • max_col – The max number of columns of table to remove.

    • +
• normalization – the specified form of Unicode normalization mode, which can be one of [‘NFC’, ‘NFKC’, ‘NFD’, ‘NFKD’]; default ‘NFC’.

    • args – extra args

    • kwargs – extra args
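A hedged sketch (the batch layout is an assumption; NFKC folds compatibility characters such as the ‘ﬁ’ ligature):

```
from data_juicer.ops.mapper import FixUnicodeMapper

op = FixUnicodeMapper(normalization='NFKC')
samples = {'text': ['ﬁle names with “curly” quotes']}
out = op.process_batched(samples)  # unicode errors fixed, text NFKC-normalized
```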

    @@ -488,56 +409,104 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.GenerateQAFromExamplesMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    -

    Remove the watermarks in videos given regions.

    +

Mapper to generate question and answer pairs from examples. You should configure an empty dataset in your yaml config file:

```
generated_dataset_config:
  type: 'EmptyFormatter'  # use RayEmptyFormatter when enabling ray
  length: ${The number of generated samples}
  feature_keys: ${text key}
```

The number of samples generated is determined by the length of the empty dataset.

    +
    +
    +DEFAULT_SYSTEM_PROMPT = '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:\n1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n'
    +
    + +
    +
    +DEFAULT_INPUT_TEMPLATE = '{}'
    +
    + +
    +
    +DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}'
    +
    + +
    +
    +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'
    +
    + +
    +
    +DEFAULT_OUTPUT_PATTERN = '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
    +
    +
    -
    -__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +
    +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • roi_strings – a given list of regions the watermarks locate. -The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”, -or “[x1, y1, x2, y2]”.

    • -
• roi_type – the roi string type. When the type is ‘pixel’, (x1, y1), (x2, y2) are the locations of pixels in the top left corner and the bottom right corner respectively. If the roi_type is ‘ratio’, the coordinates are normalized by widths and heights.

    • -
    • roi_key – the key name of fields in samples to store roi_strings -for each sample. It’s used for set different rois for different -samples. If it’s none, use rois in parameter “roi_strings”. -It’s None in default.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video to detect the pixels of watermark.

    • -
• min_frame_threshold – a coordinate is considered to be the location of a watermark pixel when it is detected as such in no fewer than min_frame_threshold frames.

    • -
• detection_method – the method to detect the pixels of watermark. If it is ‘pixel_value’, we consider the distribution of pixel values in each frame. If it is ‘pixel_diversity’, we consider the pixel diversity across different frames. In ‘pixel_diversity’ mode, min_frame_threshold is ignored and frame_num must be greater than 1.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
• hf_model – Huggingface model ID.

    • +
    • seed_file – Path to the seed file in chatml format.

    • +
    • example_num – The number of selected examples. +Randomly select N examples from “seed_file” and +put them into prompt as QA examples.

    • +
    • similarity_threshold – The similarity score threshold +between the generated samples and the seed examples. +Range from 0 to 1. Samples with similarity score less than +this threshold will be kept.

    • +
    • system_prompt – System prompt for guiding the generation task.

    • +
    • input_template – Template for building the input prompt. It must +include one placeholder ‘{}’, which will be replaced by +example_num formatted examples defined by example_template.

    • +
    • example_template – Template for formatting one QA example. It +must include one placeholder ‘{}’, which will be replaced by one +formatted qa_pair.

    • +
    • qa_pair_template – Template for formatting a single QA pair +within each example. Must include two placeholders ‘{}’ for the +question and answer.

    • +
    • output_pattern – Regular expression pattern to extract questions +and answers from model response.

    • +
    • enable_vllm – Whether to use vllm for inference acceleration.

    • +
    • model_params – Parameters for initializing the model.

    • +
    • sampling_params – Sampling parameters for text generation. +e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

    • +
    • kwargs – Extra keyword arguments.
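A hedged construction sketch using only the parameters documented above (the seed file path is hypothetical):

```
from data_juicer.ops.mapper import GenerateQAFromExamplesMapper

op = GenerateQAFromExamplesMapper(
    hf_model='Qwen/Qwen2.5-7B-Instruct',
    seed_file='demo/seed_qa.chatml.jsonl',  # hypothetical chatml seed file
    example_num=3,             # 3 seed QA examples per prompt
    similarity_threshold=0.7,  # generations too similar to the seeds are dropped
    enable_vllm=False,
    sampling_params={'temperature': 0.9, 'top_p': 0.95},
)
```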

    -
    -process_single(sample, context=False)[source]
    +
    +build_input(qa_examples)[source]
    +
    + +
    +
    +parse_output(raw_output)[source]
    +
    + +
    +
    +process_single(sample=None, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -552,102 +521,308 @@
    -
    -class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.GenerateQAFromTextMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove repeat sentences in text samples.

    +

Mapper to generate question and answer pairs from text. Recommended model list:

• ‘alibaba-pai/pai-llama3-8b-doc2qa’
• ‘alibaba-pai/pai-baichuan2-7b-doc2qa’
• ‘alibaba-pai/pai-qwen1_5-4b-doc2qa’
• ‘alibaba-pai/pai-qwen1_5-7b-doc2qa’
• ‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’
• ‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’

These recommended models are all trained with Chinese data and are suitable for Chinese.

    -
    -__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +
    +__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lowercase – Whether to convert sample text to lower case

    • -
    • ignore_special_character – Whether to ignore special -characters when judging repeated sentences. Special characters -are all characters except Chinese characters, letters and -numbers.

    • -
    • min_repeat_sentence_length – Sentences shorter than this -length will not be deduplicated. If ignore_special_character is -set to True, then special characters are not included in this -length.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
• hf_model – Huggingface model ID.

    • +
    • output_pattern – Regular expression pattern to extract +questions and answers from model response.

    • +
    • enable_vllm – Whether to use vllm for inference acceleration.

    • +
    • model_params – Parameters for initializing the model.

    • +
    • sampling_params – Sampling parameters for text generation, +e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

    • +
    • kwargs – Extra keyword arguments.

    +

The default data format parsed by this interface is as follows:

Model Input:
    蒙古国的首都是乌兰巴托(Ulaanbaatar)
    冰岛的首都是雷克雅未克(Reykjavik)

Model Output:
    蒙古国的首都是乌兰巴托(Ulaanbaatar)
    冰岛的首都是雷克雅未克(Reykjavik)
    Human: 请问蒙古国的首都是哪里? (What is the capital of Mongolia?)
    Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 (Hello! Based on the information provided, the capital of Mongolia is Ulaanbaatar.)
    Human: 冰岛的首都是哪里呢? (And what is the capital of Iceland?)
    Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 (The capital of Iceland is Reykjavik.)
    …

    +
    +
    -
    -process_batched(samples)[source]
    +
    +parse_output(raw_output)[source]
    +
    + +
    +
    +process_batched(samples, rank=None)[source]
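A hedged usage sketch with one of the recommended models (the dict-of-lists batch layout is an assumption):

```
from data_juicer.ops.mapper import GenerateQAFromTextMapper

op = GenerateQAFromTextMapper(
    hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa',
    enable_vllm=False,
    sampling_params={'temperature': 0.9, 'top_p': 0.95},
)
# Each input text may yield several QA samples parsed from the model
# response via the output pattern.
samples = {'text': ['蒙古国的首都是乌兰巴托(Ulaanbaatar)']}
out = op.process_batched(samples)
```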
    -
    -class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Bases: Mapper

    -

    Generate image by diffusion model

    +

    Mapper to blur images.

    -
    -__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +
    +__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_diffusion – diffusion model name on huggingface to generate -the image.

    • -
    • torch_dtype – the floating point type used to load the diffusion -model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]

    • -
    • revision – The specific model version to use. It can be a -branch name, a tag name, a commit id, or any identifier allowed -by Git.

    • -
    • strength – Indicates extent to transform the reference image. -Must be between 0 and 1. image is used as a starting point and -more noise is added the higher the strength. The number of -denoising steps depends on the amount of noise initially added. -When strength is 1, added noise is maximum and the denoising -process runs for the full number of iterations specified in -num_inference_steps. A value of 1 essentially ignores image.

    • -
    • guidance_scale – A higher guidance scale value encourages the -model to generate images closely linked to the text prompt at the -expense of lower image quality. Guidance scale is enabled when -guidance_scale > 1.

    • -
    • aug_num – The image number to be produced by stable-diffusion -model.

    • -
    • keep_candidate_mode

      retain strategy for the generated -$caption_num$ candidates.

      -

      ’random_any’: Retain the random one from generated captions

      -
      -
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      -
      -
      -

      ’all’: Retain all generated captions by concatenation

      -

    • +
• p – Probability of the image being blurred.

    • +
    • blur_type – Type of blur kernel, including +[‘mean’, ‘box’, ‘gaussian’].

    • +
    • radius – Radius of blur kernel.

    • +
    • args – extra args

    • +
    • kwargs – extra args
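A hedged sketch (the image path is a placeholder; each image is blurred independently with probability p):

```
from data_juicer.ops.mapper import ImageBlurMapper

op = ImageBlurMapper(p=0.5, blur_type='gaussian', radius=2)
sample = {'images': ['imgs/cat.jpg'], 'text': 'a photo of a cat'}
out = op.process_single(sample)  # image replaced by a blurred copy if sampled
```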

    -
    -

    Note

    -

This is a batched_OP, whose input and output types are both list. Suppose there are $N$ lists of input samples with batch size $b$, and denote caption_num as $M$. For ‘random_any’ and ‘similar_one_simhash’ modes, the total number of samples after generation is $2Nb$ when keep_original_sample is True and $Nb$ when it is False; for ‘all’ mode, it is $(1+M)Nb$ when keep_original_sample is True and $MNb$ when it is False.

    -
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    Parameters:
    -
      -
    • caption_key – the key name of fields in samples to store captions -for each images. It can be a string if there is only one image in -each sample. Otherwise, it should be a list. If it’s none, -ImageDiffusionMapper will produce captions for each images.

    • +

      sample – sample to process

      +
      +
      Returns:
      +

      processed sample

      +
      +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

Mapper to generate samples whose texts are generated based on gpt-4-vision and the image.

    +
    +
    +__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
• mode – mode of text generated from images, can be one of [‘reasoning’, ‘description’, ‘conversation’, ‘custom’]

    • +
    • api_key – the API key to authenticate the request.

    • +
    • max_token – the maximum number of tokens to generate. +Default is 500.

    • +
• temperature – controls the randomness of the output (range from 0 to 1). Default is 1.0.

    • +
    • system_prompt – a string prompt used to set the context of a +conversation and provide global guidance or rules for the +gpt4-vision so that it can generate responses in the expected way. +If mode set to custom, the parameter will be used.

    • +
    • user_prompt – a string prompt to guide the generation of +gpt4-vision for each samples. It’s “” in default, which means no +prompt provided.

    • +
• user_prompt_key – the key name of fields in samples to store prompts for each sample. It’s used to set different prompts for different samples. If it’s none, use the prompt in parameter “user_prompt”. It’s None in default.

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated text in the +final datasets and the original text will be removed. It’s True +in default.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_batched(samples)[source]
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper to generate samples whose captions are generated based on +another model and the figure.

    +
    +
    +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • hf_img2seq – model name on huggingface to generate caption

    • +
    • caption_num – how many candidate captions to generate +for each image

    • +
    • keep_candidate_mode

      retain strategy for the generated +$caption_num$ candidates.

      +

      ’random_any’: Retain the random one from generated captions

      +
      +
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      +
      +
      +

      ’all’: Retain all generated captions by concatenation

      +

    • +
    +
    +
    +
    +

    Note

    +

This is a batched_OP, whose input and output types are both list. Suppose there are $N$ lists of input samples with batch size $b$, and denote caption_num as $M$. For ‘random_any’ and ‘similar_one_simhash’ modes, the total number of samples after generation is $2Nb$ when keep_original_sample is True and $Nb$ when it is False; for ‘all’ mode, it is $(1+M)Nb$ when keep_original_sample is True and $MNb$ when it is False.

    +
    +
    +
    Parameters:
    +
      +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated captions in the +final datasets and the original captions will be removed. It’s True +in default.

    • +
    • prompt – a string prompt to guide the generation of blip2 model +for all samples globally. It’s None in default, which means no +prompt provided.

    • +
• prompt_key – the key name of fields in samples to store prompts for each sample. It’s used to set different prompts for different samples. If it’s none, use the prompt in parameter “prompt”. It’s None in default.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_batched(samples, rank=None)[source]
    +
    +

    Note

    +

This is a batched_OP, whose input and output types are both list. Suppose there are $N$ input sample lists with batch size $b$, and denote caption_num as $M$. The number of total samples after generation is $2Nb$ for ‘random_any’ and ‘similar_one’ modes, and $(1+M)Nb$ for ‘all’ mode.

    +
    +
    +
    Parameters:
    +

    samples

    +
    +
    Returns:
    +

    +
    +
    +
    + +
    + +
    +
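To make the sample-count arithmetic in the notes above concrete: with $N=1$ list of $b=4$ samples and caption_num $M=3$, ‘all’ mode with keep_original_sample=True yields $(1+M)Nb = 16$ samples. A hedged construction sketch:

```
from data_juicer.ops.mapper import ImageCaptioningMapper

op = ImageCaptioningMapper(
    hf_img2seq='Salesforce/blip2-opt-2.7b',
    caption_num=3,               # M = 3 candidate captions per image
    keep_candidate_mode='all',   # keep all candidates
    keep_original_sample=True,   # originals retained -> (1+M)Nb samples total
)
```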
    +class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Generate image by diffusion model

    +
    +
    +__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • hf_diffusion – diffusion model name on huggingface to generate +the image.

    • +
    • torch_dtype – the floating point type used to load the diffusion +model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]

    • +
    • revision – The specific model version to use. It can be a +branch name, a tag name, a commit id, or any identifier allowed +by Git.

    • +
    • strength – Indicates extent to transform the reference image. +Must be between 0 and 1. image is used as a starting point and +more noise is added the higher the strength. The number of +denoising steps depends on the amount of noise initially added. +When strength is 1, added noise is maximum and the denoising +process runs for the full number of iterations specified in +num_inference_steps. A value of 1 essentially ignores image.

    • +
    • guidance_scale – A higher guidance scale value encourages the +model to generate images closely linked to the text prompt at the +expense of lower image quality. Guidance scale is enabled when +guidance_scale > 1.

    • +
    • aug_num – The image number to be produced by stable-diffusion +model.

    • +
    • keep_candidate_mode

      retain strategy for the generated +$caption_num$ candidates.

      +

      ’random_any’: Retain the random one from generated captions

      +
      +
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      +
      +
      +

      ’all’: Retain all generated captions by concatenation

      +

    • +
    +
    +
    +
    +

    Note

    +

This is a batched_OP, whose input and output types are both list. Suppose there are $N$ lists of input samples with batch size $b$, and denote caption_num as $M$. For ‘random_any’ and ‘similar_one_simhash’ modes, the total number of samples after generation is $2Nb$ when keep_original_sample is True and $Nb$ when it is False; for ‘all’ mode, it is $(1+M)Nb$ when keep_original_sample is True and $MNb$ when it is False.

    +
    +
    +
    Parameters:
    +
      +
    • caption_key – the key name of fields in samples to store captions +for each images. It can be a string if there is only one image in +each sample. Otherwise, it should be a list. If it’s none, +ImageDiffusionMapper will produce captions for each images.

    • hf_img2seq – model name on huggingface to generate caption if caption_key is None.

    @@ -718,22 +893,21 @@
    -
    -class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    Bases: Mapper

    -

    Simple wrapper for FFmpeg video filters.

    +

    Mapper to generate image tags.

    -
    -__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    -

    Initialization method.

    +
    +__init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    +

Initialization method.

Parameters:
• tag_field_name – the field name to store the tags. It’s “__dj__image_tags__” in default.
      -
    • filter_name – ffmpeg video filter name.

    • -
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • -
    • global_args – list-arguments passed to ffmpeg command-line.

    • -
    • capture_stderr – whether to capture stderr.

    • -
    • overwrite_output – whether to overwrite output file.

    • args – extra args

    • kwargs – extra args

    @@ -742,8 +916,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -757,43 +931,61 @@
    -
    -
    -class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]
    -

    Bases: Mapper

    -

    Mapper to convert Chinese between Traditional Chinese, Simplified Chinese -and Japanese Kanji.

    -
    -
    -__init__(mode: str = 's2t', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • mode

      Choose the mode to convert Chinese:

      -

      s2t: Simplified Chinese to Traditional Chinese,

      -

      t2s: Traditional Chinese to Simplified Chinese,

      -

      s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),

      -

      tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,

      -

      s2hk: Simplified Chinese to Traditional Chinese -(Hong Kong variant),

      -

      hk2s: Traditional Chinese (Hong Kong variant) to Simplified -Chinese,

      -

      s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) -with Taiwanese idiom,

      -

      tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese -with Mainland Chinese idiom,

      -

      t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),

      -

      tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,

      -

      hk2t: Traditional Chinese (Hong Kong variant) to Traditional -Chinese,

      -

      t2hk: Traditional Chinese to Traditional Chinese -(Hong Kong variant),

      -

      t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese -Kanji,

      -

      jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese -Characters,

      -

    • +
      +
      +class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
      +

      Bases: Mapper

      +

      Mapper to simply augment samples in English based on nlpaug library.

      +
      +
      +__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
      +

Initialization method. All augmentation methods use default parameters by default. We recommend using only 1-3 augmentation methods at a time. Otherwise, the semantics of samples might be changed significantly.

      +
      +
      Parameters:
      +
        +
      • sequential – whether combine all augmentation methods to a +sequence. If it’s True, a sample will be augmented by all opened +augmentation methods sequentially. If it’s False, each opened +augmentation method would generate its augmented samples +independently.

      • +
      • aug_num – number of augmented samples to be generated. If +sequential is True, there will be total aug_num augmented samples +generated. If it’s False, there will be (aug_num * +#opened_aug_method) augmented samples generated.

      • +
      • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated texts in the final +datasets and the original texts will be removed. It’s True in +default.

      • +
      • delete_random_word – whether to open the augmentation method of +deleting random words from the original texts. e.g. “I love LLM” +–> “I LLM”

      • +
      • swap_random_word – whether to open the augmentation method of +swapping random contiguous words in the original texts. e.g. “I +love LLM” –> “Love I LLM”

      • +
      • spelling_error_word – whether to open the augmentation method of +simulating the spelling error for words in the original texts. e.g. +“I love LLM” –> “Ai love LLM”

      • +
      • split_random_word – whether to open the augmentation method of +splitting words randomly with whitespaces in the original texts. +e.g. “I love LLM” –> “I love LL M”

      • +
      • keyboard_error_char – whether to open the augmentation method of +simulating the keyboard error for characters in the original texts. +e.g. “I love LLM” –> “I ;ov4 LLM”

      • +
      • ocr_error_char – whether to open the augmentation method of +simulating the OCR error for characters in the original texts. +e.g. “I love LLM” –> “I 10ve LLM”

      • +
      • delete_random_char – whether to open the augmentation method of +deleting random characters from the original texts. e.g. “I love +LLM” –> “I oe LLM”

      • +
      • swap_random_char – whether to open the augmentation method of +swapping random contiguous characters in the original texts. +e.g. “I love LLM” –> “I ovle LLM”

      • +
      • insert_random_char – whether to open the augmentation method of +inserting random characters into the original texts. e.g. “I love +LLM” –> “I ^lKove LLM”

      • args – extra args

      • kwargs – extra args
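A hedged sketch of the sample-count behaviour described above (batch layout assumed): with sequential=False and two enabled methods, aug_num=2 produces 2 x 2 = 4 augmented texts per original.

```
from data_juicer.ops.mapper import NlpaugEnMapper

op = NlpaugEnMapper(
    sequential=False,
    aug_num=2,
    keep_original_sample=True,
    keyboard_error_char=True,  # e.g. "I love LLM" -> "I ;ov4 LLM"
    ocr_error_char=True,       # e.g. "I love LLM" -> "I 10ve LLM"
)
samples = {'text': ['I love LLM']}
out = op.process_batched(samples)  # 1 original + 4 augmented texts
```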

      @@ -802,8 +994,8 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
      @@ -868,88 +1060,71 @@
      -
      -class data_juicer.ops.mapper.OptimizeInstructionMapper(hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', trust_remote_code: bool = False, system_prompt: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.OptimizeQAMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

      Bases: Mapper

      -

Mapper to optimize instructions. Recommended model list:

• alibaba-pai/Qwen2-1.5B-Instruct-Refine
• alibaba-pai/Qwen2-7B-Instruct-Refine

      -
      -
      -__init__(hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', trust_remote_code: bool = False, system_prompt: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

      Initialization method. -:param hf_model: Hugginface model id. -:param trust_remote_code: passed to transformers -:param system_prompt: System prompt for optimize samples. -:param enable_vllm: Whether to use vllm for inference acceleration. -:param tensor_parallel_size: It is only valid when enable_vllm is True.

      -
      -

      The number of GPUs to use for distributed execution with tensor -parallelism.

      -
      -
      -
      Parameters:
      -
        -
      • max_model_len – It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      • -
      • max_num_seqs – It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      • -
      • sampling_params – Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      +

      Mapper to optimize question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '请优化输入的问答对,使【问题】和【回答】都更加详细、准确。必须按照以下标记格式,直接输出优化后的问答对:\n【问题】\n优化后的问题\n【回答】\n优化后的回答'
      +
      -
      -
      -process_single(sample=None, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +
      +DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
      +
      -
      +
      +
      +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
      +
      + +
      +
      +DEFAULT_OUTPUT_PATTERN = '.*?【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
      +
      -
      -
      -class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to blur images.

      -
      -__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
      +
      +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

      Initialization method.

      Parameters:
        -
• p – Probability of the image being blurred.

      • -
      • blur_type – Type of blur kernel, including -[‘mean’, ‘box’, ‘gaussian’].

      • -
      • radius – Radius of blur kernel.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • +
      • hf_model – Hugging Face model ID.

      • +
      • system_prompt – System prompt for guiding the optimization task.

      • +
      • input_template – Template for building the input for the model. +Please make sure the template contains one placeholder ‘{}’, which +corresponds to the question and answer pair generated by +param qa_pair_template.

      • +
      • qa_pair_template – Template for formatting the question and +answer pair. Please make sure the template contains two +‘{}’ to format question and answer.

      • +
      • output_pattern – Regular expression pattern to extract question +and answer from model response.

      • +
      • enable_vllm – Whether to use VLLM for inference acceleration.

      • +
      • model_params – Parameters for initializing the model.

      • +
      • sampling_params – Sampling parameters for text generation (e.g., +{‘temperature’: 0.9, ‘top_p’: 0.95}).

      • +
      • kwargs – Extra keyword arguments.
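To see how the default templates above compose, here is a hedged sketch (the default system prompt asks the model to make both the question and the answer more detailed and to reply in the tagged 【问题】/【回答】 format; the exact build_input/parse_output internals are assumptions):

```
import re

# Compose the prompt the way the default templates suggest.
qa_pair = '【问题】\n{}\n【回答】\n{}'.format('什么是数据清洗?', '对数据去噪的过程。')
prompt = '以下是原始问答对:\n{}'.format(qa_pair)

# Parse a (mock) model response with DEFAULT_OUTPUT_PATTERN.
model_response = '【问题】\n优化后的问题\n【回答】\n优化后的回答'
m = re.match(r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)', model_response, re.DOTALL)
question, answer = m.group(1), m.group(2)
```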

      -
      -process_single(sample, context=False)[source]
      +
      +build_input(sample)[source]
      +
      + +
      +
      +parse_output(raw_output)[source]
      +
      + +
      +
      +process_single(sample=None, rank=None)[source]

      For sample level, sample –> sample

      Parameters:
      @@ -964,47 +1139,52 @@
      -
      -class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to clean copyright comments at the beginning of the text -samples.

      +
      +class data_juicer.ops.mapper.OptimizeQueryMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
      +

      Bases: OptimizeQAMapper

      +

      Mapper to optimize query in question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'
      +
      +
      -
      -__init__(*args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      +
      +parse_output(raw_output)[source]
      +
      +
      +
      +
      +class data_juicer.ops.mapper.OptimizeResponseMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
      +

      Bases: OptimizeQAMapper

      +

      Mapper to optimize response in question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
      +
      +
      -
      -process_batched(samples)[source]
      +
      +parse_output(raw_output)[source]
      -
      -class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to remove non chinese Character in text samples.

      +

Mapper to normalize Unicode punctuation to English punctuation in text +samples.

      -
      -__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
      +
      +__init__(*args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • keep_alphabet – whether to keep alphabet

      • -
      • keep_number – whether to keep number

      • -
      • keep_punc – whether to keep punctuation

      • args – extra args

      • kwargs – extra args

      @@ -1013,28 +1193,25 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
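A quick sketch of the batched calling convention; batched OPs take column-oriented batches (a dict of lists, HF-datasets style), and ‘text’ is assumed to be the default text key:

```
from data_juicer.ops.mapper import PunctuationNormalizationMapper

op = PunctuationNormalizationMapper()
# Curly quotes and fullwidth punctuation should be normalized to
# their plain English counterparts.
samples = {'text': ['“quoted” text,with fullwidth punctuation。']}
print(op.process_batched(samples)['text'][0])
```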
      -
      -class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to split video by key frame.

      +

      Mapper to remove bibliography at the end of documents in Latex +samples.

      -
      -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
      +
      +__init__(*args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only split sample in the -final datasets and the original sample will be removed. It’s True -in default.

      • args – extra args

      • kwargs – extra args

      @@ -1043,31 +1220,28 @@
      -
      -get_split_key_frame(video_key, container)[source]
      -
      - -
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
      -
      -class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to clean specific chars in text samples.

      +

      Mapper to remove comments in different kinds of documents.

      +

Only ‘tex’ is supported for now.

      -
      -__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
      +
      +__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • chars_to_remove – a list or a string including all -characters that need to be removed from text.

      • +
• doc_type – Type of document from which to remove comments.

      • +
      • inline – Whether to remove inline comments.

      • +
      • multiline – Whether to remove multiline comments.

      • args – extra args

      • kwargs – extra args

      @@ -1076,77 +1250,57 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
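A small sketch of stripping TeX comments, with the same assumptions about the batch format and the default ‘text’ key:

```
from data_juicer.ops.mapper import RemoveCommentsMapper

op = RemoveCommentsMapper(doc_type='tex', inline=True, multiline=True)
samples = {'text': ['\\section{Intro} % an inline comment\nBody text.\n']}
# The '% ...' comment should be removed from the output text.
print(op.process_batched(samples)['text'][0])
```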
      -
      -class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to resize videos by aspect ratio. -AspectRatio = W / H.

      -
      -
      -STRATEGY = ['decrease', 'increase']
      -
      - +

      Mapper to remove headers at the beginning of documents in Latex +samples.

      -
      -__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
      +
      +__init__(drop_no_head: bool = True, *args, **kwargs)[source]

      Initialization method.

      -
      Parameters:
      -
        -
      • min_ratio – The minimum aspect ratio to enforce videos with -an aspect ratio below min_ratio will be resized to match -this minimum ratio. The ratio should be provided as a string -in the format “9:21” or “9/21”.

      • -
      • max_ratio – The maximum aspect ratio to enforce videos with -an aspect ratio above max_ratio will be resized to match -this maximum ratio. The ratio should be provided as a string -in the format “21:9” or “21/9”.

      • -
      • strategy – The resizing strategy to apply when adjusting the -video dimensions. It can be either ‘decrease’ to reduce the -dimension or ‘increase’ to enlarge it. Accepted values are -[‘decrease’, ‘increase’].

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      - -
      -
      -process_single(sample)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      +
      Parameters:
      +
        +
      • drop_no_head – whether to drop sample texts without +headers.

      • +
      • args – extra args

      • +
      • kwargs – extra args

      • +
      +
      +
      +process_batched(samples)[source]
      +
      +
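A sketch under the same assumptions (default ‘text’ key, column-oriented batches):

```
from data_juicer.ops.mapper import RemoveHeaderMapper

op = RemoveHeaderMapper(drop_no_head=True)
samples = {'text': ['\\documentclass{article}\n\\section{Intro}\nBody.\n']}
# The Latex header material at the beginning should be stripped.
print(op.process_batched(samples)['text'][0])
```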
      -
      -class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to clean html code in text samples.

      +

      Mapper to remove long words within a specific range.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
• min_len – The minimum word length in this op; words +will be filtered if their length is below this parameter.

      • +
• max_len – The maximum word length in this op; words +will be filtered if their length exceeds this parameter.

      • args – extra args

      • kwargs – extra args

      @@ -1155,27 +1309,32 @@
      -
      -process_batched(samples)[source]
      +
      +should_keep_long_word(word)[source]
      +
      + +
      +
      +process_batched(samples)[source]
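A sketch, assuming the default ‘text’ key:

```
from data_juicer.ops.mapper import RemoveLongWordsMapper

# Words longer than 15 characters are filtered out.
op = RemoveLongWordsMapper(min_len=1, max_len=15)
samples = {'text': ['short words and Pneumonoultramicroscopicsilicovolcanoconiosis']}
print(op.process_batched(samples)['text'][0])
```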
      -
      -class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to normalize different kinds of whitespaces to whitespace ‘ ‘ (0x20) -in text samples.

      -

      Different kinds of whitespaces can be found here: -https://en.wikipedia.org/wiki/Whitespace_character

      +

Mapper to remove non-Chinese characters in text samples.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
      • keep_alphabet – whether to keep alphabet

      • +
      • keep_number – whether to keep number

      • +
      • keep_punc – whether to keep punctuation

      • args – extra args

      • kwargs – extra args

      @@ -1184,39 +1343,33 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
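A sketch, assuming the default ‘text’ key; the sample string is illustrative:

```
from data_juicer.ops.mapper import RemoveNonChineseCharacterlMapper

# Keep ASCII letters and digits, but drop other non-Chinese characters.
op = RemoveNonChineseCharacterlMapper(keep_alphabet=True,
                                      keep_number=True,
                                      keep_punc=False)
samples = {'text': ['今天de天气真好2024!!']}
print(op.process_batched(samples)['text'][0])
```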
      -
      -class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to generate video tags from frames extract by video.

      +

      Mapper to remove repeat sentences in text samples.

      -
      -__init__(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
      +
      +__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • frame_sampling_method – sampling method of extracting frame -images from the videos. Should be one of -[“all_keyframes”, “uniform”]. -The former one extracts all key frames (the number of which depends -on the duration of the video) and the latter one extract specified -number of frames uniformly from the video. -Default: “all_keyframes”.

      • -
      • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

      • -
      • tag_field_name – the field name to store the tags. It’s -“__dj__video_frame_tags__” in default.

      • +
      • lowercase – Whether to convert sample text to lower case

      • +
      • ignore_special_character – Whether to ignore special +characters when judging repeated sentences. Special characters +are all characters except Chinese characters, letters and +numbers.

      • +
      • min_repeat_sentence_length – Sentences shorter than this +length will not be deduplicated. If ignore_special_character is +set to True, then special characters are not included in this +length.

      • args – extra args

      • kwargs – extra args

      @@ -1225,37 +1378,26 @@
      -
      -process_single(sample, rank=None, context=False)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +process_batched(samples)[source]
      +
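A sketch, assuming the default ‘text’ key:

```
from data_juicer.ops.mapper import RemoveRepeatSentencesMapper

op = RemoveRepeatSentencesMapper(lowercase=True,
                                 min_repeat_sentence_length=2)
samples = {'text': ['Today is sunny. Today is sunny. Tomorrow may rain.']}
# The duplicated sentence should appear only once in the output.
print(op.process_batched(samples)['text'][0])
```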
      -
      -class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to remove comments in different kinds of documents.

      -

      Only support ‘tex’ for now.

      +

      Mapper to clean specific chars in text samples.

      -
      -__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
      +
      +__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • doc_type – Type of document to remove comments.

      • -
      • inline – Whether to remove inline comments.

      • -
      • multiline – Whether to remove multiline comments.

      • +
      • chars_to_remove – a list or a string including all +characters that need to be removed from text.

      • args – extra args

      • kwargs – extra args

      @@ -1264,25 +1406,28 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
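A sketch, assuming the default ‘text’ key:

```
from data_juicer.ops.mapper import RemoveSpecificCharsMapper

op = RemoveSpecificCharsMapper(chars_to_remove='◆●■')
samples = {'text': ['◆ item one ● item two ■ item three']}
print(op.process_batched(samples)['text'][0])
```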
      -
      -class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to expand macro definitions in the document body of Latex -samples.

      +

      Mapper to remove table texts from text samples.

      +

A regular expression is used to remove tables whose column +numbers fall within the specified range.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
      • min_col – The min number of columns of table to remove.

      • +
      • max_col – The max number of columns of table to remove.

      • args – extra args

      • kwargs – extra args

      @@ -1291,145 +1436,62 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
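A sketch, assuming the default ‘text’ key; the exact table syntax matched by the regular expression is not shown here:

```
from data_juicer.ops.mapper import RemoveTableTextMapper

# Tables with between 2 and 20 columns are matched and removed.
op = RemoveTableTextMapper(min_col=2, max_col=20)
samples = {'text': ['intro text\n... a table with 2-20 columns ...\noutro text']}
print(op.process_batched(samples)['text'][0])
```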
      -
      -class data_juicer.ops.mapper.ExtractQAMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code: bool = False, pattern: str | None = None, qa_format: str = 'chatml', enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to extract question and answer pair from text samples. -Recommended model list: [

      -
      -

      ‘alibaba-pai/pai-llama3-8b-doc2qa’, -‘alibaba-pai/pai-baichuan2-7b-doc2qa’, -‘alibaba-pai/pai-qwen1_5-4b-doc2qa’, -‘alibaba-pai/pai-qwen1_5-7b-doc2qa’, -‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’, -‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’

      -
      -

      ] -These recommended models are all trained with Chinese data -and are suitable for Chinese.

      +

      Mapper to remove words with incorrect substrings.

      -
      -__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code: bool = False, pattern: str | None = None, qa_format: str = 'chatml', enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

      Initialization method. -:param hf_model: Hugginface model id. -:param trust_remote_code: passed to transformers -:param pattern: regular expression pattern to search for within text. -:param qa_format: Output format of question and answer pair. -:param enable_vllm: Whether to use vllm for inference acceleration. -:param tensor_parallel_size: It is only valid when enable_vllm is True.

      -
      -

      The number of GPUs to use for distributed execution with tensor -parallelism.

      -
      +
      +__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +

      Initialization method.

      Parameters:
        -
      • max_model_len – It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      • -
      • max_num_seqs – It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      • -
      • sampling_params – Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      • +
• lang – the language of the samples

      • +
      • tokenization – whether to use model to tokenize documents

      • +
      • substrings – The incorrect substrings in words.

      • args – extra args

      • kwargs – extra args

      -

      The default data format parsed by this interface is as follows: -Model Input:

      -
      -

      蒙古国的首都是乌兰巴托(Ulaanbaatar) -冰岛的首都是雷克雅未克(Reykjavik)

      -
      -
      -
      Model Output:

      蒙古国的首都是乌兰巴托(Ulaanbaatar) -冰岛的首都是雷克雅未克(Reykjavik) -Human: 请问蒙古国的首都是哪里? -Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 -Human: 冰岛的首都是哪里呢? -Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 -…

      -
      -
      -
      -process_single(sample, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +should_keep_word_with_incorrect_substrings(word, substrings)[source]
      +
      + +
      +
      +process_batched(samples)[source]
      +
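A sketch, assuming the default ‘text’ key:

```
from data_juicer.ops.mapper import RemoveWordsWithIncorrectSubstringsMapper

op = RemoveWordsWithIncorrectSubstringsMapper(lang='en',
                                              substrings=['http', 'www'])
samples = {'text': ['see www.example.com for details']}
# Words containing 'http' or 'www' should be dropped.
print(op.process_batched(samples)['text'][0])
```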
      -
      -class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to generate samples whose captions are generated based on -another model and the figure.

      +

      Mapper to replace all content in the text that matches +a specific regular expression pattern with a designated +replacement string.

      -
      -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
      +
      +__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • hf_img2seq – model name on huggingface to generate caption

      • -
      • caption_num – how many candidate captions to generate -for each image

      • -
      • keep_candidate_mode

        retain strategy for the generated -$caption_num$ candidates.

        -

        ’random_any’: Retain the random one from generated captions

        -
        -
        ’similar_one_simhash’: Retain the generated one that is most

        similar to the original caption

        -
        -
        -

        ’all’: Retain all generated captions by concatenation

        -

      • -
      -
      -
      -
      -

      Note

      -

      This is a batched_OP, whose input and output type are -both list. Suppose there are $N$ list of input samples, whose batch -size is $b$, and denote caption_num as $M$. -The number of total samples after generation is $2Nb$ when -keep_original_sample is True and $Nb$ when keep_original_sample is -False. For ‘random_any’ and ‘similar_one_simhash’ mode, -it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True -and $MNb$ when keep_original_sample is False.

      -
      -
      -
      Parameters:
      -
        -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated captions in the -final datasets and the original captions will be removed. It’s True -in default.

      • -
      • prompt – a string prompt to guide the generation of blip2 model -for all samples globally. It’s None in default, which means no -prompt provided.

      • -
      • prompt_key – the key name of fields in samples to store prompts -for each sample. It’s used for set different prompts for different -samples. If it’s none, use prompt in parameter “prompt”. It’s None -in default.

      • +
      • pattern – regular expression pattern(s) to search for within text

      • +
      • repl – replacement string(s), default is empty string

      • args – extra args

      • kwargs – extra args

      @@ -1438,44 +1500,56 @@
      -
      -process_batched(samples, rank=None)[source]
      -
      -

      Note

      -

      This is a batched_OP, whose input and output type are -both list. Suppose there are $N$ input sample list with batch -size as $b$, and denote caption_num as $M$. -the number of total samples after generation is $2Nb$ -for ‘random_any’ and ‘similar_one’ mode, -and $(1+M)Nb$ for ‘all’ mode.

      -
      +
      +process_batched(samples)[source]
      +
      + +
      + +
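A sketch, assuming the default ‘text’ key:

```
from data_juicer.ops.mapper import ReplaceContentMapper

# Replace every run of digits with a placeholder token.
op = ReplaceContentMapper(pattern=r'\d+', repl='<NUM>')
samples = {'text': ['Order 12345 shipped on 2024-06-01']}
print(op.process_batched(samples)['text'][0])
```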
      +
      +class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
      +

      Bases: Mapper

      +

Mapper to split text samples into sentences.

      +
      +
      +__init__(lang: str = 'en', *args, **kwargs)[source]
      +

      Initialization method.

      Parameters:
      -

      samples

      -
      -
      Returns:
      -

      +
        +
• lang – the language in which to split the text into sentences.

      • +
      • args – extra args

      • +
      • kwargs – extra args

      • +
      +
      +
      +process_batched(samples)[source]
      +
      +
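A sketch, assuming the default ‘text’ key; the sentence-splitting model for the chosen language is fetched on first use:

```
from data_juicer.ops.mapper import SentenceSplitMapper

op = SentenceSplitMapper(lang='en')
samples = {'text': ['First sentence. Second sentence? Third!']}
# The text should come back split into individual sentences.
print(op.process_batched(samples)['text'][0])
```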
      -
      -class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to remove words with incorrect substrings.

      +

Mapper to caption a video according to its audio streams, based on the +Qwen-Audio model.

      -
      -__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +
      +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • lang – sample in which language

      • -
      • tokenization – whether to use model to tokenize documents

      • -
      • substrings – The incorrect substrings in words.

      • +
      • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only captioned sample in the +final datasets and the original sample will be removed. It’s True +in default.

      • args – extra args

      • kwargs – extra args

      @@ -1484,32 +1558,27 @@
      -
      -should_keep_word_with_incorrect_substrings(word, substrings)[source]
      -
      - -
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples, rank=None)[source]
      -
      -class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

      Bases: Mapper

      Mapper to generate samples whose captions are generated based on -a video-to-text model and sampled video frame.

+an image-to-text model and sampled video frames. Captions from different +frames will be concatenated into a single string.

      -
      -__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
      +
      +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • hf_video_blip – video-blip model name on huggingface -to generate caption

      • +
      • hf_img2seq – model name on huggingface to generate caption

      • caption_num – how many candidate captions to generate for each video

      • keep_candidate_mode

        retain strategy for the generated @@ -1542,7 +1611,7 @@ it’s set to False, there will be only generated captions in the final datasets and the original captions will be removed. It’s True in default.

      • -
      • prompt – a string prompt to guide the generation of video-blip +

      • prompt – a string prompt to guide the generation of image-to-text model for all samples globally. It’s None in default, which means no prompt provided.

      • prompt_key – the key name of fields in samples to store prompts @@ -1572,8 +1641,8 @@

      -
      -process_batched(samples, rank=None, context=False)[source]
      +
      +process_batched(samples, rank=None, context=False)[source]
      Parameters:

      samples

      @@ -1583,316 +1652,67 @@
      -

      Note

      -

      This is a batched_OP, whose the input and output type are -both list. Suppose there are $N$ input sample list with batch -size as $b$, and denote caption_num as $M$. -the number of total samples after generation is $2Nb$ -for ‘random_any’ and ‘similar_one’ mode, -and $(1+M)Nb$ for ‘all’ mode.

      -
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to generate video captions by summarizing several kinds of generated -texts (captions from video/audio/frames, tags from audio/frames, …)

      -
      -
      -__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
      • hf_summarizer – the summarizer model used to summarize texts -generated by other methods.

      • -
      • consider_video_caption_from_video – whether to consider the video -caption generated from video directly in the summarization process. -Default: True.

      • -
      • consider_video_caption_from_audio – whether to consider the video -caption generated from audio streams in the video in the -summarization process. Default: True.

      • -
      • consider_video_caption_from_frames – whether to consider the -video caption generated from sampled frames from the video in the -summarization process. Default: True.

      • -
      • consider_video_tags_from_audio – whether to consider the video -tags generated from audio streams in the video in the summarization -process. Default: True.

      • -
      • consider_video_tags_from_frames – whether to consider the video -tags generated from sampled frames from the video in the -summarization process. Default: True.

      • -
      • vid_cap_from_vid_args – the arg dict for video captioning from -video directly with keys are the arg names and values are the arg -values. Default: None.

      • -
      • vid_cap_from_frm_args – the arg dict for video captioning from -sampled frames from the video with keys are the arg names and -values are the arg values. Default: None.

      • -
      • vid_tag_from_aud_args – the arg dict for video tagging from audio -streams in the video with keys are the arg names and values are the -arg values. Default: None.

      • -
      • vid_tag_from_frm_args – the arg dict for video tagging from -sampled frames from the video with keys are the arg names and -values are the arg values. Default: None.

      • -
      • keep_tag_num – max number N of tags from sampled frames to keep. -Too many tags might bring negative influence to summarized text, so -we consider to only keep the N most frequent tags. Default: 5.

      • -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only summarized captions in the -final datasets and the original captions will be removed. It’s True -in default.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      - -
      -
      -process_batched(samples, rank=None)[source]
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.GenerateInstructionMapper(hf_model: str = 'Qwen/Qwen-7B-Chat', seed_file: str = '', instruct_num: int[int] = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str | None = None, qa_pair_template: str | None = None, example_template: str | None = None, qa_extraction_pattern: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to generate new instruction text data. -You should configure an empty dataset in your yaml config file: -``` -generated_dataset_config:

      -
      -

      type: ‘EmptyFormatter’ # use RayEmptyFormatter when enable ray -length: ${The number of generated samples} -feature_keys: ${text key}

      -
      -

      ``` -The number of samples generated is determined by -the length of the empty dataset.

      -
      -
      -__init__(hf_model: str = 'Qwen/Qwen-7B-Chat', seed_file: str = '', instruct_num: int[int] = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str | None = None, qa_pair_template: str | None = None, example_template: str | None = None, qa_extraction_pattern: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -
      -

      Initialization method.

      -
      -
      param hf_model:
      -

      Hugginface model id.

      -
      -
      param seed_file:
      -

      Seed file path, chatml format.

      -
      -
      param instruct_num:
      -

      The number of instruction samples. -Randomly select N samples from “seed_file” and -put them into prompt as instruction samples.

      -
      -
      param trust_remote_code:
      -

      passed to transformers

      -
      -
      param similarity_threshold:
      -

      The similarity score threshold -between the generated samples and the seed samples. -Range from 0 to 1. Samples with similarity score less than -this threshold will be kept.

      -
      -
      param prompt_template:
      -

      Prompt template for generate samples. -Please make sure the template contains “{augmented_data}”, -which corresponds to the augmented samples.

      -
      -
      param qa_pair_template:
      -

      Prompt template for generate question -and answer pair description. Please make sure the template -contains two “{}” to format question and answer. -Default: ‘【问题】

      -
      -
      -
      -

      {} -【回答】 -{} -‘.

      -
      -
      -
      param example_template:
      -

      Prompt template for generate examples. -Please make sure the template contains “{qa_pairs}”, which -corresponds to the question and answer pair description -generated by param qa_pair_template. -Default: ‘

      -
      -
      -
      -

      如下是一条示例数据:

      -
      -
      {qa_pairs}’
      -
      param qa_extraction_pattern:
      -

      Regular expression pattern for parsing -question and answer from model response.

      -
      -
      param enable_vllm:
      -

      Whether to use vllm for inference acceleration.

      -
      -
      param tensor_parallel_size:
      -

      It is only valid when enable_vllm is True. -The number of GPUs to use for distributed execution with tensor -parallelism.

      -
      -
      param max_model_len:
      -

      It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      -
      -
      param max_num_seqs:
      -

      It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      -
      -
      param sampling_params:
      -

      Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      -
      -
      param args:
      -

      extra args

      -
      -
      param kwargs:
      -

      extra args

      -
      -
      -
      -
      -
      - -
      -
      -load_seed_qa_samples(seed_file)[source]
      -

      Load QA pairs from chatml format file.

      -
      - -
      -
      -build_prompt(qa_samples, prompt_template)[source]
      -
      - -
      -
      -parse_chatml_str(input_str)[source]
      -
      - -
      -
      -parse_response(response_str)[source]
      -
      - -
      -
      -max_rouge_l_score(reference, candidates)[source]
      -
      - -
      -
      -process_single(sample=None, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to fix unicode errors in text samples.

      -
      -
      -__init__(normalization: str | None = None, *args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
      • normalization – the specified form of Unicode -normalization mode, which can be one of -[‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’], default ‘NFC’.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      +

      Note

      +

This is a batched_OP, whose input and output types are +both lists. Suppose there are $N$ input sample lists with batch +size $b$, and denote caption_num as $M$. +The number of total samples after generation is $2Nb$ +for ‘random_any’ and ‘similar_one’ modes, +and $(1+M)Nb$ for ‘all’ mode.

      +
    -
    -
    -process_batched(samples)[source]
    -
    -
    -
    -class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to simply augment samples in English based on nlpaug library.

    +

    Mapper to generate video captions by summarizing several kinds of generated +texts (captions from video/audio/frames, tags from audio/frames, …)

    -
    -__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    -

    Initialization method. All augmentation methods use default parameters -in default. We recommend you to only use 1-3 augmentation methods at a -time. Otherwise, the semantics of samples might be changed -significantly.

    +
    +__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
    +

    Initialization method.

    Parameters:
      -
    • sequential – whether combine all augmentation methods to a -sequence. If it’s True, a sample will be augmented by all opened -augmentation methods sequentially. If it’s False, each opened -augmentation method would generate its augmented samples -independently.

    • -
    • aug_num – number of augmented samples to be generated. If -sequential is True, there will be total aug_num augmented samples -generated. If it’s False, there will be (aug_num * -#opened_aug_method) augmented samples generated.

    • +
    • hf_summarizer – the summarizer model used to summarize texts +generated by other methods.

    • +
    • consider_video_caption_from_video – whether to consider the video +caption generated from video directly in the summarization process. +Default: True.

    • +
    • consider_video_caption_from_audio – whether to consider the video +caption generated from audio streams in the video in the +summarization process. Default: True.

    • +
    • consider_video_caption_from_frames – whether to consider the +video caption generated from sampled frames from the video in the +summarization process. Default: True.

    • +
    • consider_video_tags_from_audio – whether to consider the video +tags generated from audio streams in the video in the summarization +process. Default: True.

    • +
    • consider_video_tags_from_frames – whether to consider the video +tags generated from sampled frames from the video in the +summarization process. Default: True.

    • +
• vid_cap_from_vid_args – the arg dict for video captioning from +video directly, whose keys are the arg names and values are the arg +values. Default: None.

    • +
• vid_cap_from_frm_args – the arg dict for video captioning from +sampled frames from the video, whose keys are the arg names and +values are the arg values. Default: None.

    • +
• vid_tag_from_aud_args – the arg dict for video tagging from audio +streams in the video, whose keys are the arg names and values are the +arg values. Default: None.

    • +
• vid_tag_from_frm_args – the arg dict for video tagging from +sampled frames from the video, whose keys are the arg names and +values are the arg values. Default: None.

    • +
• keep_tag_num – max number N of tags from sampled frames to keep. +Too many tags might negatively influence the summarized text, so +we only keep the N most frequent tags. Default: 5.

    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated texts in the final -datasets and the original texts will be removed. It’s True in -default.

    • -
    • delete_random_word – whether to open the augmentation method of -deleting random words from the original texts. e.g. “I love LLM” -–> “I LLM”

    • -
    • swap_random_word – whether to open the augmentation method of -swapping random contiguous words in the original texts. e.g. “I -love LLM” –> “Love I LLM”

    • -
    • spelling_error_word – whether to open the augmentation method of -simulating the spelling error for words in the original texts. e.g. -“I love LLM” –> “Ai love LLM”

    • -
    • split_random_word – whether to open the augmentation method of -splitting words randomly with whitespaces in the original texts. -e.g. “I love LLM” –> “I love LL M”

    • -
    • keyboard_error_char – whether to open the augmentation method of -simulating the keyboard error for characters in the original texts. -e.g. “I love LLM” –> “I ;ov4 LLM”

    • -
    • ocr_error_char – whether to open the augmentation method of -simulating the OCR error for characters in the original texts. -e.g. “I love LLM” –> “I 10ve LLM”

    • -
    • delete_random_char – whether to open the augmentation method of -deleting random characters from the original texts. e.g. “I love -LLM” –> “I oe LLM”

    • -
    • swap_random_char – whether to open the augmentation method of -swapping random contiguous characters in the original texts. -e.g. “I love LLM” –> “I ovle LLM”

    • -
    • insert_random_char – whether to open the augmentation method of -inserting random characters into the original texts. e.g. “I love -LLM” –> “I ^lKove LLM”

    • +it’s set to False, there will be only summarized captions in the +final datasets and the original captions will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -1901,27 +1721,27 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples, rank=None)[source]
    -
    -class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose captions are generated based on -an image-to-text model and sampled video frames. Captions from different -frames will be concatenated to a single string.

    +a video-to-text model and sampled video frame.

    -
    -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +
    +__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_img2seq – model name on huggingface to generate caption

    • +
    • hf_video_blip – video-blip model name on huggingface +to generate caption

    • caption_num – how many candidate captions to generate for each video

    • keep_candidate_mode

      retain strategy for the generated @@ -1954,7 +1774,7 @@ it’s set to False, there will be only generated captions in the final datasets and the original captions will be removed. It’s True in default.

    • -
    • prompt – a string prompt to guide the generation of image-to-text +

    • prompt – a string prompt to guide the generation of video-blip model for all samples globally. It’s None in default, which means no prompt provided.

    • prompt_key – the key name of fields in samples to store prompts @@ -1984,8 +1804,8 @@

    -
    -process_batched(samples, rank=None, context=False)[source]
    +
    +process_batched(samples, rank=None, context=False)[source]
    Parameters:

    samples

    @@ -2008,21 +1828,172 @@
    -
    -class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Simple wrapper for FFmpeg video filters.

    +
    +
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • filter_name – ffmpeg video filter name.

    • +
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • +
    • global_args – list-arguments passed to ffmpeg command-line.

    • +
    • capture_stderr – whether to capture stderr.

    • +
    • overwrite_output – whether to overwrite output file.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    + +
    +
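A sketch of wrapping a single ffmpeg filter; the video path is a placeholder and ‘videos’ is assumed to be the default video key:

```
from data_juicer.ops.mapper import VideoFFmpegWrappedMapper

# Rescale each video in the sample with the ffmpeg 'scale' filter.
op = VideoFFmpegWrappedMapper(filter_name='scale',
                              filter_kwargs={'width': 1280, 'height': 720})
sample = {'videos': ['/path/to/video.mp4'], 'text': ''}
processed = op.process_single(sample)
```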
    +class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper to blur faces detected in videos.

    +
    +
    +__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • cv_classifier – OpenCV classifier path for face detection. +By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • +
    • blur_type – Type of blur kernel, including +[‘mean’, ‘box’, ‘gaussian’].

    • +
    • radius – Radius of blur kernel.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Remove the watermarks in videos given regions.

    +
    +
    +__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
• roi_strings – a given list of regions where the watermarks are located. +The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”, +or “[x1, y1, x2, y2]”.

    • +
• roi_type – the roi string type. When the type is ‘pixel’, (x1, +y1), (x2, y2) are the locations of pixels in the top left corner +and the bottom right corner respectively. If the roi_type is +‘ratio’, the coordinates are normalized by widths and heights.

    • +
• roi_key – the key name of fields in samples to store roi_strings +for each sample. It’s used to set different rois for different +samples. If it’s none, use rois in parameter “roi_strings”. +It’s None in default.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video to detect the pixels of watermark.

    • +
• min_frame_threshold – a coordinate is considered to be the +location of a watermark pixel when it is detected as one in at least +min_frame_threshold frames.

    • +
• detection_method – the method to detect the pixels of watermark. +If it is ‘pixel_value’, we consider the distribution of pixel +values in each frame. If it is ‘pixel_diversity’, we will consider +the pixel diversity in different frames. min_frame_threshold +is ignored and frame_num must be greater than 1 in +‘pixel_diversity’ mode.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    + +
    +
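A sketch with a placeholder video path, using the documented defaults:

```
from data_juicer.ops.mapper import VideoRemoveWatermarkMapper

# Clean a watermark expected in the top-left tenth of each frame,
# detected from pixel values over 10 uniformly sampled frames.
op = VideoRemoveWatermarkMapper(roi_strings=['0,0,0.1,0.1'],
                                roi_type='ratio',
                                frame_num=10,
                                min_frame_threshold=7,
                                detection_method='pixel_value')
sample = {'videos': ['/path/to/video.mp4'], 'text': ''}
processed = op.process_single(sample)
```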
    +class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove long words within a specific range.

    +

    Mapper to resize videos by aspect ratio. +AspectRatio = W / H.

    +
    +
    +STRATEGY = ['decrease', 'increase']
    +
    +
    -
    -__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_len – The min mapper word length in this op, words -will be filtered if their length is below this parameter.

    • -
    • max_len – The max mapper word length in this op, words -will be filtered if their length exceeds this parameter.

    • +
• min_ratio – The minimum aspect ratio to enforce; videos with +an aspect ratio below min_ratio will be resized to match +this minimum ratio. The ratio should be provided as a string +in the format “9:21” or “9/21”.

    • +
• max_ratio – The maximum aspect ratio to enforce; videos with +an aspect ratio above max_ratio will be resized to match +this maximum ratio. The ratio should be provided as a string +in the format “21:9” or “21/9”.

    • +
    • strategy – The resizing strategy to apply when adjusting the +video dimensions. It can be either ‘decrease’ to reduce the +dimension or ‘increase’ to enlarge it. Accepted values are +[‘decrease’, ‘increase’].

    • args – extra args

    • kwargs – extra args

    @@ -2031,14 +2002,18 @@
    -
    -should_keep_long_word(word)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
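A sketch with a placeholder video path:

```
from data_juicer.ops.mapper import VideoResizeAspectRatioMapper

# Videos outside the [9/21, 21/9] aspect-ratio band are resized,
# enlarging the deficient dimension ('increase').
op = VideoResizeAspectRatioMapper(min_ratio='9/21',
                                  max_ratio='21/9',
                                  strategy='increase')
sample = {'videos': ['/path/to/video.mp4'], 'text': ''}
processed = op.process_single(sample)
```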
    @@ -2089,19 +2064,25 @@
    -
    -class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to clean email in text samples.

    +

    Mapper to split video by duration.

    -
    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +
    +__init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • pattern – regular expression pattern to search for within text.

    • -
    • repl – replacement string, default is empty string.

    • +
    • split_duration – duration of each video split in seconds.

    • +
    • min_last_split_duration – The minimum allowable duration in +seconds for the last video split. If the duration of the last +split is less than this value, it will be discarded.

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only cut sample in the +final datasets and the original sample will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -2110,28 +2091,33 @@
    -
    -process_batched(samples)[source]
    +
    +split_videos_by_duration(video_key, container)[source]
    +
    + +
    +
    +process_batched(samples)[source]
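A sketch with a placeholder video path, assuming the default ‘videos’/‘text’ keys and column-oriented batches:

```
from data_juicer.ops.mapper import VideoSplitByDurationMapper

# Cut each video into 10-second clips and discard a trailing clip
# shorter than 3 seconds; keep only the split samples.
op = VideoSplitByDurationMapper(split_duration=10,
                                min_last_split_duration=3,
                                keep_original_sample=False)
samples = {'videos': [['/path/to/video.mp4']], 'text': ['a caption']}
result = op.process_batched(samples)
```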
    -
    -class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to replace all content in the text that matches -a specific regular expression pattern with a designated -replacement string.

    +

    Mapper to split video by key frame.

    -
    -__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +
    +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • pattern – regular expression pattern(s) to search for within text

    • -
    • repl – replacement string(s), default is empty string

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only split sample in the +final datasets and the original sample will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -2140,29 +2126,39 @@
    -
    -process_batched(samples)[source]
    +
    +get_split_key_frame(video_key, container)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    -

    Simple wrapper for FFmpeg audio filters.

    +

    Mapper to cut videos into scene clips.

    +
    +
    +avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    +
    +
    - __init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    + __init__(detector: str = 'ContentDetector', threshold: float = 27.0, min_scene_len: int = 15, show_progress: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:

    - • filter_name – ffmpeg audio filter name.

    - • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    - • global_args – list-arguments passed to ffmpeg command-line.

    - • capture_stderr – whether to capture stderr.

    - • overwrite_output – whether to overwrite output file.

    + • detector – Algorithm from scenedetect.detectors. Should be one of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector’].

    + • threshold – Threshold passed to the detector.

    + • min_scene_len – Minimum length of any scene.

    + • show_progress – Whether to show progress from scenedetect.

    • args – extra args

    • kwargs – extra args
    @@ -2171,8 +2167,8 @@

    - process_single(sample)[source]

    + process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
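
    A hedged sketch of the scene-split op. The avaliable_detectors mapping above lists the detector-specific kwargs each algorithm understands; whether those can be forwarded through **kwargs is an assumption of this sketch, as is the sample layout:

        from data_juicer.ops.mapper import VideoSplitBySceneMapper

        # Cut a video into scene clips with the default content-based detector.
        op = VideoSplitBySceneMapper(detector='ContentDetector',
                                     threshold=27.0,
                                     min_scene_len=15,
                                     show_progress=False)
        sample = {'text': 'a demo video', 'videos': ['demo.mp4']}  # assumed layout
        out = op.process_single(sample)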
    @@ -2187,25 +2183,22 @@

    - class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    + class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Bases: Mapper

    - Mapper to split video by duration.

    + Mapper to generate video tags from audio streams extracted from videos using the Audio Spectrogram Transformer.

    - __init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    + __init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:

    - • split_duration – duration of each video split in seconds.

    - • min_last_split_duration – the minimum allowable duration in seconds for the last video split. If the last split is shorter than this value, it will be discarded.

    - • keep_original_sample – whether to keep the original sample. If it’s set to False, only the cut samples are kept in the final dataset and the original sample is removed. It’s True by default.

    + • hf_ast – path to the HF model used to tag from audio.

    + • trust_remote_code – whether to trust the remote code of HF models

    + • tag_field_name – the field name to store the tags. It’s “__dj__video_audio_tags__” by default.

    • args – extra args

    • kwargs – extra args

    @@ -2214,34 +2207,48 @@

    - split_videos_by_duration(video_key, container)[source]

    - process_batched(samples)[source]

    + process_single(sample, rank=None)[source]

    For sample level, sample –> sample

    Parameters:

    sample – sample to process

    Returns:

    processed sample
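
    A hedged sketch for the audio-based tagger; model loading and device placement are handled by the framework, and the sample layout below is an illustrative assumption:

        from data_juicer.ops.mapper import VideoTaggingFromAudioMapper

        # Tag videos from their audio tracks with the AST model; tags are
        # written into the sample under tag_field_name
        # ('__dj__video_audio_tags__' by default).
        op = VideoTaggingFromAudioMapper(
            hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593')
        sample = {'text': 'a demo video', 'videos': ['demo.mp4']}  # assumed layout
        tagged = op.process_single(sample)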
    - class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    + class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: int = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Bases: Mapper

    - Mapper to blur faces detected in videos.

    + Mapper to generate video tags from frames extracted from videos.

    - __init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    + __init__(frame_sampling_method: str = 'all_keyframes', frame_num: int = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:

    - • cv_classifier – OpenCV classifier path for face detection. By default, we will use ‘haarcascade_frontalface_alt.xml’.

    - • blur_type – Type of blur kernel, including [‘mean’, ‘box’, ‘gaussian’].

    - • radius – Radius of blur kernel.

    + • frame_sampling_method – sampling method for extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames (the number of which depends on the duration of the video) and the latter extracts a specified number of frames uniformly from the video. Default: “all_keyframes”.

    + • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it’s 1, only the middle frame will be extracted. If it’s 2, only the first and the last frames will be extracted. If it’s larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.

    + • tag_field_name – the field name to store the tags. It’s “__dj__video_frame_tags__” by default.

    • args – extra args

    • kwargs – extra args

    @@ -2250,8 +2257,8 @@

    - process_single(sample, context=False)[source]

    + process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
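
    A hedged sketch of the frame-based tagger using uniform sampling; the sample layout is an illustrative assumption:

        from data_juicer.ops.mapper import VideoTaggingFromFramesMapper

        # With frame_num=3 and 'uniform' sampling, the first and last frames
        # plus one uniformly placed frame are tagged; results are stored
        # under '__dj__video_frame_tags__' by default.
        op = VideoTaggingFromFramesMapper(frame_sampling_method='uniform',
                                          frame_num=3)
        sample = {'text': 'a demo video', 'videos': ['demo.mp4']}  # assumed layout
        tagged = op.process_single(sample)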
    @@ -2266,18 +2273,17 @@

    - class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    + class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]

    Bases: Mapper

    - Mapper to generate image tags.

    + Mapper to normalize different kinds of whitespaces to the whitespace ‘ ’ (0x20) in text samples.

    + Different kinds of whitespaces can be found here: https://en.wikipedia.org/wiki/Whitespace_character

    - __init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    - Initialization method.
    - :param tag_field_name: the field name to store the tags. It’s “__dj__image_tags__” by default.

    + __init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:

    @@ -2289,18 +2295,9 @@
    - process_single(sample, rank=None, context=False)[source]

    - For sample level, sample –> sample

    - Parameters:

    -     sample – sample to process

    - Returns:

    -     processed sample

    + process_batched(samples)[source]
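
    A minimal sketch of the whitespace normalizer; the input string and expected result are illustrative, and the exact set of normalized characters follows the Wikipedia list referenced above:

        from data_juicer.ops.mapper import WhitespaceNormalizationMapper

        # Non-breaking (U+00A0) and thin (U+2009) spaces should be rewritten
        # as plain ASCII spaces (0x20).
        op = WhitespaceNormalizationMapper()
        samples = {'text': ['hello\u00a0world\u2009!']}
        out = op.process_batched(samples)
        # expected: out['text'][0] == 'hello world !'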
diff --git a/genindex.html b/genindex.html
index 71efd4518..a34a5abdb 100644
--- a/genindex.html
+++ b/genindex.html
@@ -270,12 +270,12 @@

    _

  • (data_juicer.ops.mapper.CleanLinksMapper method)
  • (data_juicer.ops.mapper.ExpandMacroMapper method)
- • (data_juicer.ops.mapper.ExtractQAMapper method)
  • (data_juicer.ops.mapper.FixUnicodeMapper method)
- • (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ • (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
+ • (data_juicer.ops.mapper.GenerateQAFromTextMapper method)
  • (data_juicer.ops.mapper.ImageBlurMapper method)

@@ -293,7 +293,7 @@

    _

  • (data_juicer.ops.mapper.NlpcdaZhMapper method)
- • (data_juicer.ops.mapper.OptimizeInstructionMapper method)
+ • (data_juicer.ops.mapper.OptimizeQAMapper method)
  • (data_juicer.ops.mapper.PunctuationNormalizationMapper method)

@@ -412,8 +412,12 @@

    B

@@ -635,8 +639,6 @@

    D

  • module
  • data_juicer.ops.mapper

@@ -651,6 +653,8 @@

    D

  • module
  • data_juicer.tools

@@ -667,6 +671,36 @@

    D

  • Deduplicator (class in data_juicer.ops)
+ • DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+ • DEFAULT_INPUT_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+ • DEFAULT_OUTPUT_PATTERN (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+ • DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+ • DEFAULT_SYSTEM_PROMPT (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
  • DiversityAnalysis (class in data_juicer.analysis)
  • DocumentDeduplicator (class in data_juicer.ops.deduplicator)

@@ -694,11 +728,11 @@

    E

  • execute_and_probe() (data_juicer.core.Adapter static method)
  • Executor (class in data_juicer.core)
  • ExpandMacroMapper (class in data_juicer.ops.mapper)

@@ -735,14 +767,16 @@

    F

    G

+ • get_sentences_from_document() (in module data_juicer.ops.common)
  • get_split_key_frame() (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
  • get_words_from_document() (in module data_juicer.ops.common)

@@ -842,8 +876,6 @@

    L

  • load_from_disk() (data_juicer.core.NestedDataset static method)
  • load_ops() (in module data_juicer.ops)
- • load_seed_qa_samples() (data_juicer.ops.mapper.GenerateInstructionMapper method)
  • LocalFormatter (class in data_juicer.format)

@@ -858,8 +890,6 @@

    M

  • Mapper (class in data_juicer.ops)
  • MAX_BATCH_SIZE (data_juicer.core.Adapter attribute)
- • max_rouge_l_score() (data_juicer.ops.mapper.GenerateInstructionMapper method)
  • MaximumLineLengthFilter (class in data_juicer.ops.filter)

@@ -938,10 +968,14 @@

    N

    O

    P

  • ParquetFormatter (class in data_juicer.format)
- • parse_chatml_str() (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ • parse_output() (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
  • PerplexityFilter (class in data_juicer.ops.filter)
  • PhraseGroundingRecallFilter (class in data_juicer.ops.filter)

@@ -1026,6 +1068,8 @@

    P

  • (data_juicer.ops.mapper.ExpandMacroMapper method)
  • (data_juicer.ops.mapper.FixUnicodeMapper method)
+ • (data_juicer.ops.mapper.GenerateQAFromTextMapper method)
  • (data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)

@@ -1155,9 +1199,7 @@

    P

  • (data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
- • (data_juicer.ops.mapper.ExtractQAMapper method)
- • (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ • (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
  • (data_juicer.ops.mapper.ImageBlurMapper method)

@@ -1165,7 +1207,7 @@

    P

  • (data_juicer.ops.mapper.ImageTaggingMapper method)
- • (data_juicer.ops.mapper.OptimizeInstructionMapper method)
+ • (data_juicer.ops.mapper.OptimizeQAMapper method)
  • (data_juicer.ops.mapper.VideoFaceBlurMapper method)

diff --git a/index.html b/index.html
index 63647e7b5..3eaf19f33 100644
--- a/index.html
+++ b/index.html
@@ -104,111 +104,113 @@

    Tutorial

  • data_juicer.ops.filter
  • data_juicer.ops.mapper
  • data_juicer.ops.deduplicator
  • data_juicer.ops.selector
diff --git a/objects.inv b/objects.inv
index 3ef7dcac8..f6f555efe 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index 0a6106f31..3c211b572 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({...})  [regenerated Sphinx search index; minified one-line payload omitted]
"data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, 
"module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, 
"data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, 
"data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, 
"data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, 
"data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, 
"data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, 
"data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, 
"data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter 
method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": 
[[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() 
(data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], 
"process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateinstructionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeinstructionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, 
"data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeinstructionmapper method)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, 
"data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_prompt() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.build_prompt"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "load_seed_qa_samples() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.load_seed_qa_samples"]], "max_rouge_l_score() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.max_rouge_l_score"]], "parse_chatml_str() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.parse_chatml_str"]], "parse_response() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.parse_response"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() 
(data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], 
"process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeinstructionmapper method)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, 
"data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], 
"column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 8, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 3], "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": 3, "sample_interv": 3, "0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 
7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 10], "resourc": 3, "util": 3, "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "set": [3, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": 3, "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8], "pre": 3, "execut": 3, "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 3, "won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": 3, "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "load_from_disk": 3, "wa": 3, "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "str": [3, 4, 6, 7, 8, 9, 10], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "f": [3, 4], "instanc": [3, 5], "deprec": 3, "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "3": [3, 8, 9], "pleas": [3, 7, 9], "storage_opt": 3, "instead": [3, 4, 6], "keep_in_memori": 3, "bool": [3, 7, 8, 9, 10], "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "enabl": [3, 9], "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 13], "improv": 3, "section": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 8, 9], "py": [3, 4], "d": [3, 4], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": 
[3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], "dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "analyze_resource_util_list": 3, "resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 7, 8, 9, 10], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 5, "op_fus": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 
7, 8, 9], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "documentdedupl": [7, 13], "ignore_non_charact": 7, "exact": 7, "match": [7, 8, 9], "md5": 7, "ignor": [7, 9], "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "punctuat": [7, 9], "documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "kept": [7, 8, 9], "final": [7, 9], "should": [7, 8, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "shingl": 7, "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": 7, "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "imagededupl": [7, 13], "phash": 7, "consider_text": 7, "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "raydocumentdedupl": [7, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "videodedupl": [7, 13], "alphanumericfilt": [8, 13], "min_ratio": [8, 9], "25": 8, "max_ratio": [8, 9], "9223372036854775807": [8, 9], "numer": 8, "within": [8, 9, 10], "alphanumer": 8, "total": [8, 9], "below": [8, 9], "audiodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "any_or_al": [8, 9], "durat": [8, 9], "second": [8, 9], "sy": 8, "maxsiz": 8, "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "averagelinelengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "characterrepetitionfilt": [8, 13], 
"rep_len": 8, "gram": 8, "repetit": 8, "flaggedwordfilt": [8, 13], "lang": [8, 9], "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": 8, "flag": 8, "what": 8, "adopt": 8, "flagged_word": 8, "join": 8, "imageaestheticsfilt": [8, 13], "hf_scorer_model": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "aesthet": 8, "score": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "keyword": [8, 9], "imageaspectratiofilt": [8, 13], "333": 8, "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], "min_face_count": 8, "max_face_count": 8, "face": [8, 9], "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "minimum": [8, 9], "requir": 8, "imagefaceratiofilt": [8, 13], "area": 8, "largest": [8, 10], "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "imagepairsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "closedunitinterv": 8, "imageshapefilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "shape": 8, "width": [8, 9], "height": [8, 9], "imagesizefilt": [8, 13], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "reduce_mod": 8, "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": 8, "take": 8, "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": 8, "probabl": [8, 9], "languageidscorefilt": [8, 13], "confid": 8, "larger": [8, 9, 10], "identif": 8, "maximumlinelengthfilt": [8, 13], "perplexityfilt": [8, 13], "max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "out": 8, "larg": 8, "account": 8, "specialcharactersfilt": [8, 13], "specifiedfieldfilt": [8, 13], "field_kei": [8, 10], "target_valu": 8, "multi": [8, 10, 13], "retain": [8, 9], "specifiednumericfieldfilt": [8, 13], "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "suffixfilt": [8, 13], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "zh": 8, "mini_action_num": 8, "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "mini_dependency_num": 8, "edg": 8, "depend": [8, 9], "objet": 8, "textlengthfilt": [8, 13], "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": [8, 9], "videoaestheticsfilt": [8, 13], "frame_sampling_method": [8, 9], "frame_num": [8, 9], "frame": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "addit": [8, 9], 
"videoaspectratiofilt": [8, 13], "21": [8, 9], "videodurationfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": 8, "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "equal": [8, 9, 10], "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "found": [8, 9], "here": [8, 9, 13], "http": [8, 9], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videoresolutionfilt": [8, 13], "resolut": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": 8, "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "videowatermarkfilt": [8, 13], "wordrepetitionfilt": [8, 13], "wordsnumfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "output": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "cleancopyrightmapp": [9, 13], "clean": 9, "copyright": 9, "comment": 9, "begin": 9, "cleanemailmapp": [9, 13], "repl": 9, "email": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "ipv4": 9, "ipv6": 9, "address": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "latex": 9, "fixunicodemapp": [9, 13], "fix": 9, "unicod": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "generateqafromexamplesmapp": [9, 13], "hf_model": 9, "qwen": 9, "qwen2": 9, "7b": 9, "instruct": 9, "seed_fil": 9, "example_num": 9, "similarity_threshold": 9, "system_prompt": 9, "input_templ": 9, "example_templ": 9, "qa_pair_templ": 9, "output_pattern": 9, "enable_vllm": 9, "model_param": 9, "sampling_param": 9, "question": 9, "answer": 9, "you": 9, "your": 9, "default_system_prompt": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, "\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u95ee\u9898": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u6ce8\u610f": 9, "\u65b0\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "n1": 9, "\u751f\u6210\u7684": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e00\u81f4": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "n2": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, 
"\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "n3": 9, "\u63d0\u4f9b\u7684": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "n4": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u800c\u4e14": 9, "\u9700\u8981\u5728": 9, "\u4e4b\u524d": 9, "default_input_templ": 9, "default_example_templ": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "default_qa_pair_templ": 9, "default_output_pattern": 9, "hugginfac": 9, "id": 9, "chatml": 9, "put": 9, "prompt": 9, "qa": 9, "guid": 9, "task": 9, "templat": 9, "build": 9, "placehold": 9, "defin": 9, "qa_pair": 9, "respons": 9, "vllm": 9, "infer": 9, "acceler": 9, "temperatur": 9, "top_p": 9, "build_input": 9, "qa_exampl": 9, "parse_output": 9, "raw_output": 9, "generateqafromtextmapp": [9, 13], "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "suitabl": 9, "interfac": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imageblurmapp": [9, 13], "p": 9, "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "blure": 9, "kernel": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "user_prompt": 9, "user_prompt_kei": 9, "keep_original_sampl": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "control": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "uers_prompt_kei": 9, "imagecaptioningmapp": [9, 13], "hf_img2seq": 9, "blip2": 9, "opt": 9, "caption_num": 9, "keep_candidate_mod": 9, "random_ani": 9, "prompt_kei": 9, "caption": 9, "anoth": 9, "how": 9, "mani": 9, "candid": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "similar_on": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "commit": 9, "git": 9, "extent": 9, "transform": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "nlpaugenmapp": [9, 13], "sequenti": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "librari": 9, "semant": 9, "significantli": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "love": 9, "llm": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "nlpcdazhmapp": [9, 13], "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, "nlpcda": 9, "notic": 9, 
"\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeqamapp": [9, 13], "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u4f7f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u51c6\u786e": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "make": 9, "sure": 9, "optimizequerymapp": [9, 13], "queri": 9, "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "optimizeresponsemapp": [9, 13], "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, "\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "punctuationnormalizationmapp": [9, 13], "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "drop": 9, "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "replacecontentmapp": [9, 13], "design": 9, "sentencesplitmapp": [9, 13], "videocaptioningfromaudiomapp": [9, 13], "stream": 9, "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videoffmpegwrappedmapp": [9, 13], "videofaceblurmapp": [9, 13], "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "videoresizeaspectratiomapp": [9, 13], 
"increas": 9, "decreas": 9, "enforc": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "cut": 9, "split_videos_by_dur": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "hf": 9, "trust": 9, "videotaggingfromframesmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", 
"__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, 
"", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", 
"WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", 
"__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, 
"", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], 
"data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", 
"process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": 
{"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib 
(data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() 
(data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter 
method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], 
"get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter 
(class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, 
"data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], 
"__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, 
"data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, 
"data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, 
"data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, 
"data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_qa_pair_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt 
(data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper"]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper"]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper"]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], 
"videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, 
"data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() 
(data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input"]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.parse_output"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, 
"data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, 
"data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() 
(data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}})
\ No newline at end of file