diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index f20745f11..c8a2a259f 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -325,9 +325,10 @@
if inspect . ismethod ( called_func ):
# batched is required for fault-tolerant or batched OP
- if not called_func . __self__ . turbo or hasattr (
+ if callable ( getattr (
called_func . __self__ ,
- 'is_batched_op' ) and called_func . __self__ . is_batched_op ():
+ 'is_batched_op' )) and called_func . __self__ . is_batched_op (
+ ) or not called_func . __self__ . turbo :
kargs [ 'batched' ] = True
kargs [ 'batch_size' ] = kargs . pop ( 'batch_size' , 1 ) if hasattr (
called_func . __self__ , 'is_batched_op'
@@ -335,6 +336,12 @@ Source code for data_juicer.core.data
else :
kargs [ 'batched' ] = False
+ # rank is required for cuda model loading
+ if callable (
+ getattr ( called_func . __self__ ,
+ 'use_cuda' )) and called_func . __self__ . use_cuda ():
+ kargs [ 'with_rank' ] = True
+
if 'new_fingerprint' not in kargs or kargs [ 'new_fingerprint' ] is None :
new_fingerprint = generate_fingerprint ( self , * args , ** kargs )
kargs [ 'new_fingerprint' ] = new_fingerprint
@@ -379,10 +386,12 @@ Source code for data_juicer.core.data
called_func = called_func . __wrapped__
# Batched is always required for fault tolerance
- if inspect . ismethod (
- called_func ) and called_func . __self__ . is_batched_op ():
- kargs [ 'batched' ] = True
- kargs [ 'batch_size' ] = kargs . pop ( 'batch_size' , 1 )
+ if inspect . ismethod ( called_func ):
+ if callable ( getattr (
+ called_func . __self__ ,
+ 'is_batched_op' )) and called_func . __self__ . is_batched_op ():
+ kargs [ 'batched' ] = True
+ kargs [ 'batch_size' ] = kargs . pop ( 'batch_size' , 1 )
if 'new_fingerprint' not in kargs or kargs [ 'new_fingerprint' ] is None :
new_fingerprint = generate_fingerprint ( self , * args , ** kargs )
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index d6b0bf3f8..d736b81d0 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -81,6 +81,7 @@ Source code for data_juicer.ops.base_op
import traceback
from functools import wraps
+import numpy as np
import pyarrow as pa
from loguru import logger
@@ -212,6 +213,11 @@ Source code for data_juicer.ops.base_op
self . image_key = kwargs . get ( 'image_key' , 'images' )
self . audio_key = kwargs . get ( 'audio_key' , 'audios' )
self . video_key = kwargs . get ( 'video_key' , 'videos' )
+
+ self . query_key = kwargs . get ( 'query_key' , 'query' )
+ self . response_key = kwargs . get ( 'response_key' , 'response' )
+ self . history_key = kwargs . get ( 'history_key' , 'history' )
+
self . batch_size = kwargs . get ( 'batch_size' , 1000 )
# whether the model can be accelerated using cuda
@@ -289,6 +295,9 @@ Source code for data_juicer.ops.base_op
dataset = NestedDataset ( dataset )
return dataset
+ def empty_history ( self ):
+ return np . empty (( 0 , 0 ), dtype = str )
+
[docs] class Mapper ( OP ):
diff --git a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html b/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
deleted file mode 100644
index 4984076cb..000000000
--- a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
+++ /dev/null
@@ -1,275 +0,0 @@
-
-
-
-
-
-
-
-
data_juicer.ops.mapper.extract_qa_mapper — data_juicer 0.2.0 documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- data_juicer
-
-
-
-
-
-
-
-
-
Source code for data_juicer.ops.mapper.extract_qa_mapper
-import json
-import re
-from typing import Dict , Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS , UNFORKABLE , Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model , prepare_model
-
-torch = LazyLoader ( 'torch' , 'torch' )
-vllm = LazyLoader ( 'vllm' , 'vllm' )
-
-OP_NAME = 'extract_qa_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
similarity index 50%
rename from _modules/data_juicer/ops/mapper/generate_instruction_mapper.html
rename to _modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
index e97503fa1..455dfa757 100644
--- a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html
+++ b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
@@ -5,7 +5,7 @@
-
data_juicer.ops.mapper.generate_instruction_mapper — data_juicer 0.2.0 documentation
+
data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 0.2.0 documentation
@@ -67,7 +67,7 @@
Module code
data_juicer
-
data_juicer.ops.mapper.generate_instruction_mapper
+
data_juicer.ops.mapper.generate_qa_from_examples_mapper
@@ -76,7 +76,7 @@
-
Source code for data_juicer.ops.mapper.generate_instruction_mapper
+ Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper
import json
import random
import re
@@ -94,26 +94,15 @@ Source code for data_juicer.ops.mapper.generate_instruction_mapper vllm
= LazyLoader ( 'vllm' , 'vllm' )
rouge = LazyLoader ( 'rouge' , 'rouge' )
-
DEFAULT_PROMPT_TEMPLATE = """
-
请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:
-
1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。
-
2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。
-
3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。
-
4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。
-
{augmented_data}
-
"""
-
QA_EXTRACTION_PATTERN = r '【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)'
-
EXAMPLE_TEMPLATE = ' \n 如下是一条示例数据: \n\n {qa_pairs} '
-
QA_PAIR_TEMPLATE = '【问题】 \n {} \n 【回答】 \n {} \n '
-
-
OP_NAME = 'generate_instruction_mapper'
+
OP_NAME = 'generate_qa_from_examples_mapper'
# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs] @UNFORKABLE . register_module ( OP_NAME )
+
[docs] @UNFORKABLE . register_module ( OP_NAME )
@OPERATORS . register_module ( OP_NAME )
-
class GenerateInstructionMapper ( Mapper ):
-
"""Mapper to generate new instruction text data.
+
class GenerateQAFromExamplesMapper ( Mapper ):
+
"""
+
Mapper to generate question and answer pairs from examples.
You should configure an empty dataset in your yaml config file:
```
generated_dataset_config:
@@ -124,161 +113,148 @@
Source code for data_juicer.ops.mapper.generate_instruction_mapper The number of samples generated is determined by
the length of the empty dataset.
"""
+
+
DEFAULT_SYSTEM_PROMPT = (
+
'请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。'
+
'注意,新生成的【问题】和【回答】需要满足如下要求: \n '
+
'1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。 \n '
+
'2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。 \n '
+
'3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。 \n '
+
'4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。 \n ' )
+
+
DEFAULT_INPUT_TEMPLATE = ' {} '
+
DEFAULT_EXAMPLE_TEMPLATE = ' \n 如下是一条示例数据: \n {} '
+
DEFAULT_QA_PAIR_TEMPLATE = '【问题】 \n {} \n 【回答】 \n {} \n '
+
DEFAULT_OUTPUT_PATTERN = r '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
+
_accelerator = 'cuda'
-
[docs] def __init__ ( self ,
-
hf_model : str = 'Qwen/Qwen-7B-Chat' ,
+
[docs] def __init__ ( self ,
+
hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' ,
+
* ,
seed_file : str = '' ,
-
instruct_num : PositiveInt = 3 ,
-
trust_remote_code : bool = False ,
+
example_num : PositiveInt = 3 ,
similarity_threshold : float = 0.7 ,
-
prompt_template : Optional [ str ] = None ,
-
qa_pair_template : Optional [ str ] = None ,
+
system_prompt : Optional [ str ] = None ,
+
input_template : Optional [ str ] = None ,
example_template : Optional [ str ] = None ,
-
qa_extraction_pattern : Optional [ str ] = None ,
-
enable_vllm : bool = True ,
-
tensor_parallel_size : Optional [ int ] = None ,
-
max_model_len : Optional [ int ] = None ,
-
max_num_seqs : int = 256 ,
-
sampling_params : Dict = {},
-
* args ,
+
qa_pair_template : Optional [ str ] = None ,
+
output_pattern : Optional [ str ] = None ,
+
enable_vllm : bool = False ,
+
model_params : Optional [ Dict ] = None ,
+
sampling_params : Optional [ Dict ] = None ,
** kwargs ):
"""
Initialization method.
-
:param hf_model: Hugginface model id.
-
:param seed_file: Seed file path, chatml format.
-
:param instruct_num: The number of instruction samples.
-
Randomly select N samples from "seed_file" and
-
put them into prompt as instruction samples.
-
:param trust_remote_code: passed to transformers
+
:param hf_model: Hugging Face model ID.
+
:param seed_file: Path to the seed file in chatml format.
+
:param example_num: The number of selected examples.
+
Randomly select N examples from "seed_file" and
+
put them into prompt as QA examples.
:param similarity_threshold: The similarity score threshold
-
between the generated samples and the seed samples.
+
between the generated samples and the seed examples.
Range from 0 to 1. Samples with similarity score less than
this threshold will be kept.
-
:param prompt_template: Prompt template for generate samples.
-
Please make sure the template contains "{augmented_data}",
-
which corresponds to the augmented samples.
-
:param qa_pair_template: Prompt template for generate question
-
and answer pair description. Please make sure the template
-
contains two "{}" to format question and answer.
-
Default: '【问题】\n{}\n【回答】\n{}\n'.
-
:param example_template: Prompt template for generate examples.
-
Please make sure the template contains "{qa_pairs}", which
-
corresponds to the question and answer pair description
-
generated by param `qa_pair_template`.
-
Default: '\n如下是一条示例数据:\n\n{qa_pairs}'
-
:param qa_extraction_pattern: Regular expression pattern for parsing
-
question and answer from model response.
+
:param system_prompt: System prompt for guiding the generation task.
+
:param input_template: Template for building the input prompt. It must
+
include one placeholder '{}', which will be replaced by
+
`example_num` formatted examples defined by `example_template`.
+
:param example_template: Template for formatting one QA example. It
+
must include one placeholder '{}', which will be replaced by one
+
formatted qa_pair.
+
:param qa_pair_template: Template for formatting a single QA pair
+
within each example. Must include two placeholders '{}' for the
+
question and answer.
+
:param output_pattern: Regular expression pattern to extract questions
+
and answers from model response.
:param enable_vllm: Whether to use vllm for inference acceleration.
-
:param tensor_parallel_size: It is only valid when enable_vllm is True.
-
The number of GPUs to use for distributed execution with tensor
-
parallelism.
-
:param max_model_len: It is only valid when enable_vllm is True.
-
Model context length. If unspecified, will be automatically
-
derived from the model config.
-
:param max_num_seqs: It is only valid when enable_vllm is True.
-
Maximum number of sequences to be processed in a single iteration.
+
:param model_params: Parameters for initializing the model.
:param sampling_params: Sampling parameters for text generation.
e.g {'temperature': 0.9, 'top_p': 0.95}
-
:param args: extra args
-
:param kwargs: extra args
+
:param kwargs: Extra keyword arguments.
"""
-
super () . __init__ ( * args , ** kwargs )
-
self . num_proc = 1
+
super () . __init__ ( ** kwargs )
if not seed_file :
raise ValueError (
'Please provide `seed_file` in chatml format.'
'Example: data-juicer/demos/data/demo-dataset-chatml.jsonl' )
-
self . instruct_num = instruct_num
+
self . seed_file = seed_file
+
self . example_num = example_num
self . similarity_threshold = similarity_threshold
self . similarity_type = 'rouge_l'
-
if prompt_template is None :
-
prompt_template = DEFAULT_PROMPT_TEMPLATE
-
if qa_pair_template is None :
-
qa_pair_template = QA_PAIR_TEMPLATE
-
if example_template is None :
-
example_template = EXAMPLE_TEMPLATE
-
if qa_extraction_pattern is None :
-
qa_extraction_pattern = QA_EXTRACTION_PATTERN
-
-
self . prompt_template = prompt_template
-
self . qa_pair_template = qa_pair_template
-
self . example_template = example_template
-
self . qa_extraction_pattern = qa_extraction_pattern
+
self . system_prompt = system_prompt or self . DEFAULT_SYSTEM_PROMPT
+
self . input_template = input_template or self . DEFAULT_INPUT_TEMPLATE
+
self . example_template = example_template or self . DEFAULT_EXAMPLE_TEMPLATE # noqa: E501
+
self . qa_pair_template = qa_pair_template or \
+
self . DEFAULT_QA_PAIR_TEMPLATE
+
self . output_pattern = output_pattern or self . DEFAULT_OUTPUT_PATTERN
self . enable_vllm = enable_vllm
+
model_params = model_params or {}
+
sampling_params = sampling_params or {}
if enable_vllm :
-
assert torch . cuda . device_count () >= 1 , 'must be executed in CUDA'
-
if not tensor_parallel_size :
+
# cannot initialize vllm replicas on different GPUs
+
self . num_proc = 1
+
if model_params . get ( 'tensor_parallel_size' ) is None :
tensor_parallel_size = torch . cuda . device_count ()
logger . info ( f 'Set tensor_parallel_size to \
{ tensor_parallel_size } for vllm.' )
+
model_params [ 'tensor_parallel_size' ] = tensor_parallel_size
self . model_key = prepare_model (
model_type = 'vllm' ,
pretrained_model_name_or_path = hf_model ,
-
trust_remote_code = trust_remote_code ,
-
tensor_parallel_size = tensor_parallel_size ,
-
max_model_len = max_model_len ,
-
max_num_seqs = max_num_seqs )
+
** model_params )
self . sampling_params = vllm . SamplingParams ( ** sampling_params )
else :
self . model_key = prepare_model (
model_type = 'huggingface' ,
pretrained_model_name_or_path = hf_model ,
-
trust_remote_code = trust_remote_code )
+
return_pipe = True ,
+
** model_params )
self . sampling_params = sampling_params
-
self . seed_qa_samples = self . load_seed_qa_samples ( seed_file )
-
+
self . seed_qa_samples = self . _load_seed_qa_samples ()
if len ( self . seed_qa_samples ) == 0 :
-
raise ValueError ( 'No QA data was parsed from the seed file!' )
+
raise ValueError ( 'No QA data was parsed from the seed file!' )
-
self . reference_samples = [
-
' \n ' . join ([ ' \n ' . join ( qa_pair ) for qa_pair in qa_pairs ]) + ' \n '
-
for qa_pairs in self . seed_qa_samples
-
]
-
-
[docs] def load_seed_qa_samples ( self , seed_file ):
+
def _load_seed_qa_samples ( self ):
"""Load QA pairs from chatml format file."""
qa_samples = []
-
with open ( seed_file ) as f :
+
with open ( self . seed_file , encoding = 'utf-8' ) as f :
lines = f . readlines ()
for line in lines :
line = line . strip ()
-
qa_pairs = self . parse_chatml_str ( line )
+
qa_pairs = self . _parse_chatml_str ( line )
if len ( qa_pairs ) > 0 :
qa_samples . append ( qa_pairs )
+
return qa_samples
-
return qa_samples
-
-
[docs] def build_prompt ( self , qa_samples , prompt_template ):
+
def _sample_to_str ( self , qa_sample ):
+
return ' \n ' . join ([ ' \n ' . join ( qa_pair ) for qa_pair in qa_sample ]) + ' \n '
-
def format_qa_pairs ( qa_pairs ):
-
return '' . join ([
-
self . qa_pair_template . format ( q , a ) for q , a in qa_pairs
-
if q and a
-
])
-
-
body_fragments = [
-
self . example_template . format ( qa_pairs = format_qa_pairs ( qa_pairs ))
-
for qa_pairs in qa_samples
-
]
-
-
body = '' . join ( body_fragments )
-
-
return prompt_template . format ( augmented_data = body )
+
def _max_rouge_l_score ( self , hypothesis , references ):
+
r = rouge . Rouge ()
+
max_score = 0.0
+
hyp_str = self . _sample_to_str ( hypothesis )
+
for reference in references :
+
ref_str = self . _sample_to_str ( reference )
+
scores = r . get_scores ( hyp_str , ref_str )
+
rouge_l_score = scores [ 0 ][ 'rouge-l' ][ 'f' ]
+
if rouge_l_score > max_score :
+
max_score = rouge_l_score
+
return max_score
-
[docs] def parse_chatml_str ( self , input_str ):
+
def _parse_chatml_str ( self , sample_str ):
user_input = None
assistant_output = None
qa_pairs = []
-
data = json . loads ( input_str )
+
data = json . loads ( sample_str )
for message in data [ 'messages' ]:
role = message [ 'role' ]
content = message [ 'content' ]
@@ -287,79 +263,91 @@
Source code for data_juicer.ops.mapper.generate_instruction_mapper elif role == 'assistant' :
assistant_output = content
qa_pairs . append (( user_input , assistant_output ))
- return qa_pairs
-
-
[docs] def parse_response ( self , response_str ):
-
pattern = self . qa_extraction_pattern
-
matches = re . findall ( pattern , response_str , re . DOTALL )
-
response_str = ''
-
out_qa_pairs = []
-
for i , match in enumerate ( matches ):
-
question , answer = match
-
question = question . strip ()
-
answer = answer . strip ()
-
out_qa_pairs . append (( question , answer ))
-
response_str += question + ' \n ' + answer + ' \n '
+
return qa_pairs
-
if len ( out_qa_pairs ) == 0 :
-
logger . error ( 'Parse model response error! '
-
'No data generated for the current response!' )
+
-
-
[docs] def max_rouge_l_score ( self , reference , candidates ):
+
def format_qa_pairs ( qa_example ):
+
return '' . join ([
+
self . qa_pair_template . format ( q , a ) for q , a in qa_example
+
if q and a
+
])
-
r = rouge . Rouge ()
-
max_score = 0.0
-
for candidate in candidates :
-
scores = r . get_scores ( candidate , reference )
-
rouge_l_score = scores [ 0 ][ 'rouge-l' ][ 'f' ]
-
if rouge_l_score > max_score :
-
max_score = rouge_l_score
-
return max_score
+
formatted_examples = '' . join ([
+
self . example_template . format ( qa_pairs = format_qa_pairs ( qa_example ))
+
for qa_example in qa_examples
+
])
+
input_prompt = self . input_template . format ( examples = formatted_examples )
+
return input_prompt
+
+
[docs] def parse_output ( self , raw_output ):
+
logger . debug ( raw_output )
+
output_qa_pairs = []
+
matches = re . findall ( self . output_pattern , raw_output , re . DOTALL )
+
for match in matches :
+
question , answer = match
+
output_qa_pairs . append (( question . strip (), answer . strip ()))
+
return output_qa_pairs
-
[docs] def process_single ( self , sample = None , rank = None ):
-
model , processor = get_model ( self . model_key , rank = rank )
+
[docs] def process_single ( self , sample = None , rank = None ):
+
model , _ = get_model ( self . model_key , rank , self . use_cuda ())
random_qa_samples = random . sample ( self . seed_qa_samples ,
-
self . instruct_num )
-
input_prompt = self . build_prompt ( random_qa_samples ,
-
self . prompt_template )
+
self . example_num )
+
input_prompt = self . build_input ( random_qa_samples )
+
+
messages = [{
+
'role' : 'system' ,
+
'content' : self . system_prompt
+
}, {
+
'role' : 'user' ,
+
'content' : input_prompt
+
}]
+
if self . enable_vllm :
-
response = model . generate ([ input_prompt ], self . sampling_params )
-
response_str = response [ 0 ] . outputs [ 0 ] . text
+
response = model . chat ( messages , self . sampling_params )
+
output = response [ 0 ] . outputs [ 0 ] . text
else :
-
inputs = processor ( input_prompt ,
-
return_tensors = 'pt' ) . to ( model . device )
-
output_ids = model . generate ( ** inputs , ** self . sampling_params )
-
# remove the input prompt from the output
-
output_ids = output_ids [:, inputs . data [ 'input_ids' ] . shape [ 1 ]:]
-
response_str = processor . decode ( output_ids . cpu ()[ 0 ],
-
skip_special_tokens = True )
-
message_list = []
-
out_qa_pairs , response_str = self . parse_response ( response_str )
-
-
if not response_str :
-
return { self . text_key : json . dumps ({ 'messages' : message_list })}
+
# model is pipe
+
response = model ( messages ,
+
return_full_text = False ,
+
** self . sampling_params )
+
output = response [ 0 ][ 'generated_text' ]
+
+
output_qa_pairs = self . parse_output ( output )
+
if len ( output_qa_pairs ) == 0 :
+
logger . warning ( 'Parse model response error! '
+
'No data generated for the current response!' )
+
sample . update ({
+
self . query_key : '' ,
+
self . response_key : '' ,
+
self . history_key : self . empty_history ()
+
})
+
return sample
if self . similarity_type == 'rouge_l' :
-
sim_score = self . max_rouge_l_score ( response_str ,
-
self . reference_samples )
+
sim_score = self . _max_rouge_l_score ( output_qa_pairs ,
+
random_qa_samples )
else :
raise ValueError (
f 'Not support similarity type " { self . similarity_type } "!' )
if sim_score <= self . similarity_threshold :
-
for question , answer in out_qa_pairs :
-
message_list . append ({ 'role' : 'user' , 'content' : question })
-
message_list . append ({ 'role' : 'assistant' , 'content' : answer })
+
query , response = output_qa_pairs [ - 1 ]
+
history = output_qa_pairs [: - 1 ]
+
if len ( history ) == 0 :
+
history = self . empty_history ()
else :
+
query = response = ''
+
history = self . empty_history ()
logger . info ( 'Filter this generated sample due to similarity.' )
-
return {
-
self . text_key :
-
json . dumps ({ 'messages' : message_list }, ensure_ascii = False )
-
}
+
sample . update ({
+
self . query_key : query ,
+
self . response_key : response ,
+
self . history_key : history
+
})
+
return sample
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
new file mode 100644
index 000000000..02571066e
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
@@ -0,0 +1,255 @@
+
+
+
+
+
+
+
+
data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 0.2.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ data_juicer
+
+
+
+
+
+
+
+
+
Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper
+import re
+from typing import Dict , Optional
+
+from loguru import logger
+
+from data_juicer.ops.base_op import OPERATORS , UNFORKABLE , Mapper
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.model_utils import get_model , prepare_model
+
+torch = LazyLoader ( 'torch' , 'torch' )
+vllm = LazyLoader ( 'vllm' , 'vllm' )
+
+OP_NAME = 'generate_qa_from_text_mapper'
+
+
+# TODO: Extend LLM-based OPs into API-based implementation.
+[docs] @UNFORKABLE . register_module ( OP_NAME )
+
@OPERATORS . register_module ( OP_NAME )
+
class GenerateQAFromTextMapper ( Mapper ):
+
"""
+
Mapper to generate question and answer pairs from text.
+
Recommended model list: [
+
'alibaba-pai/pai-llama3-8b-doc2qa',
+
'alibaba-pai/pai-baichuan2-7b-doc2qa',
+
'alibaba-pai/pai-qwen1_5-4b-doc2qa',
+
'alibaba-pai/pai-qwen1_5-7b-doc2qa',
+
'alibaba-pai/pai-qwen1_5-1b8-doc2qa',
+
'alibaba-pai/pai-qwen1_5-0b5-doc2qa'
+
]
+
These recommended models are all trained with Chinese data
+
and are suitable for Chinese.
+
"""
+
+
_accelerator = 'cuda'
+
_batched_op = True
+
+
[docs] def __init__ ( self ,
+
hf_model : str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa' ,
+
* ,
+
output_pattern : Optional [ str ] = None ,
+
enable_vllm : bool = False ,
+
model_params : Optional [ Dict ] = None ,
+
sampling_params : Optional [ Dict ] = None ,
+
** kwargs ):
+
"""
+
Initialization method.
+
+
:param hf_model: Hugging Face model ID.
+
:param output_pattern: Regular expression pattern to extract
+
questions and answers from model response.
+
:param enable_vllm: Whether to use vllm for inference acceleration.
+
:param model_params: Parameters for initializing the model.
+
:param sampling_params: Sampling parameters for text generation,
+
e.g., {'temperature': 0.9, 'top_p': 0.95}
+
:param kwargs: Extra keyword arguments.
+
+
The default data format parsed by this interface is as follows:
+
Model Input:
+
蒙古国的首都是乌兰巴托(Ulaanbaatar)
+
冰岛的首都是雷克雅未克(Reykjavik)
+
Model Output:
+
蒙古国的首都是乌兰巴托(Ulaanbaatar)
+
冰岛的首都是雷克雅未克(Reykjavik)
+
Human: 请问蒙古国的首都是哪里?
+
Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。
+
Human: 冰岛的首都是哪里呢?
+
Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。
+
...
+
"""
+
+
super () . __init__ ( ** kwargs )
+
+
if output_pattern is None :
+
self . output_pattern = r 'Human:(.*?)Assistant:(.*?)(?=Human|$)' # noqa: E501
+
else :
+
self . output_pattern = output_pattern
+
+
self . enable_vllm = enable_vllm
+
model_params = model_params or {}
+
sampling_params = sampling_params or {}
+
+
if enable_vllm :
+
assert torch . cuda . device_count () >= 1 , 'must be executed in CUDA'
+
# cannot initialize vllm replicas on different GPUs
+
self . num_proc = 1
+
if model_params . get ( 'tensor_parallel_size' ) is None :
+
tensor_parallel_size = torch . cuda . device_count ()
+
logger . info ( f 'Set tensor_parallel_size to \
+
{ tensor_parallel_size } for vllm.' )
+
model_params [ 'tensor_parallel_size' ] = tensor_parallel_size
+
self . model_key = prepare_model (
+
model_type = 'vllm' ,
+
pretrained_model_name_or_path = hf_model ,
+
** model_params )
+
self . sampling_params = vllm . SamplingParams ( ** sampling_params )
+
else :
+
self . model_key = prepare_model (
+
model_type = 'huggingface' ,
+
pretrained_model_name_or_path = hf_model ,
+
return_pipe = True ,
+
** model_params )
+
self . sampling_params = sampling_params
+
+
[docs] def parse_output ( self , raw_output ):
+
logger . debug ( raw_output )
+
qa_list = []
+
matches = re . findall ( self . output_pattern , raw_output , re . DOTALL )
+
for match in matches :
+
user , assistant = match
+
qa_list . append (( user . strip (), assistant . strip ()))
+
return qa_list
+
+
[docs] def process_batched ( self , samples , rank = None ):
+
model , _ = get_model ( self . model_key , rank , self . use_cuda ())
+
+
input_keys = samples . keys ()
+
num_samples = len ( samples [ next ( iter ( input_keys ))])
+
output_keys = input_keys | { self . query_key , self . response_key }
+
output_samples = { key : [] for key in output_keys }
+
+
for i in range ( num_samples ):
+
messages = [{ 'role' : 'user' , 'content' : samples [ self . text_key ][ i ]}]
+
+
if self . enable_vllm :
+
response = model . chat ( messages , self . sampling_params )
+
output = response [ 0 ] . outputs [ 0 ] . text
+
else :
+
# model is pipe
+
response = model ( messages ,
+
return_full_text = False ,
+
** self . sampling_params )
+
output = response [ 0 ][ 'generated_text' ]
+
+
qa_list = self . parse_output ( output )
+
if len ( qa_list ) > 0 :
+
for q , a in qa_list :
+
for input_k in input_keys :
+
output_samples [ input_k ] . append ( samples [ input_k ][ i ])
+
output_samples [ self . query_key ] . append ( q )
+
output_samples [ self . response_key ] . append ( a )
+
else :
+
logger . warning (
+
'No question and answer was extracted from current sample!'
+
)
+
+
return output_samples
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html b/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
deleted file mode 100644
index 9404f2336..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
+++ /dev/null
@@ -1,224 +0,0 @@
-
-
-
-
-
-
-
-
data_juicer.ops.mapper.optimize_instruction_mapper — data_juicer 0.2.0 documentation
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- data_juicer
-
-
-
-
-
-
-
-
-
Source code for data_juicer.ops.mapper.optimize_instruction_mapper
-from typing import Dict , Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS , UNFORKABLE , Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model , prepare_model
-
-torch = LazyLoader ( 'torch' , 'torch' )
-vllm = LazyLoader ( 'vllm' , 'vllm' )
-
-DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。'
-
-OP_NAME = 'optimize_instruction_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-[docs] @UNFORKABLE . register_module ( OP_NAME )
-
@OPERATORS . register_module ( OP_NAME )
-
class OptimizeInstructionMapper ( Mapper ):
-
"""Mapper to optimize instruction.
-
Recommended model list: [
-
alibaba-pai/Qwen2-1.5B-Instruct-Refine
-
alibaba-pai/Qwen2-7B-Instruct-Refine
-
]
-
"""
-
_accelerator = 'cuda'
-
-
[docs] def __init__ ( self ,
-
hf_model : str = 'alibaba-pai/Qwen2-7B-Instruct-Refine' ,
-
trust_remote_code : bool = False ,
-
system_prompt : Optional [ str ] = None ,
-
enable_vllm : bool = True ,
-
tensor_parallel_size : Optional [ int ] = None ,
-
max_model_len : Optional [ int ] = None ,
-
max_num_seqs : int = 256 ,
-
sampling_params : Dict = {},
-
* args ,
-
** kwargs ):
-
"""
-
Initialization method.
-
:param hf_model: Hugginface model id.
-
:param trust_remote_code: passed to transformers
-
:param system_prompt: System prompt for optimize samples.
-
:param enable_vllm: Whether to use vllm for inference acceleration.
-
:param tensor_parallel_size: It is only valid when enable_vllm is True.
-
The number of GPUs to use for distributed execution with tensor
-
parallelism.
-
:param max_model_len: It is only valid when enable_vllm is True.
-
Model context length. If unspecified, will be automatically
-
derived from the model config.
-
:param max_num_seqs: It is only valid when enable_vllm is True.
-
Maximum number of sequences to be processed in a single iteration.
-
:param sampling_params: Sampling parameters for text generation.
-
e.g {'temperature': 0.9, 'top_p': 0.95}
-
:param args: extra args
-
:param kwargs: extra args
-
"""
-
super () . __init__ ( * args , ** kwargs )
-
self . num_proc = 1
-
-
if system_prompt is None :
-
system_prompt = DEFAULT_SYSTEM_PROMPT
-
self . system_prompt = system_prompt
-
self . enable_vllm = enable_vllm
-
-
if enable_vllm :
-
assert torch . cuda . device_count () >= 1 , 'must be executed in CUDA'
-
if not tensor_parallel_size :
-
tensor_parallel_size = torch . cuda . device_count ()
-
logger . info ( f 'Set tensor_parallel_size to \
-
{ tensor_parallel_size } for vllm.' )
-
self . model_key = prepare_model (
-
model_type = 'vllm' ,
-
pretrained_model_name_or_path = hf_model ,
-
trust_remote_code = trust_remote_code ,
-
tensor_parallel_size = tensor_parallel_size ,
-
max_model_len = max_model_len ,
-
max_num_seqs = max_num_seqs )
-
self . sampling_params = vllm . SamplingParams ( ** sampling_params )
-
else :
-
self . model_key = prepare_model (
-
model_type = 'huggingface' ,
-
pretrained_model_name_or_path = hf_model ,
-
trust_remote_code = trust_remote_code )
-
self . sampling_params = sampling_params
-
-
[docs] def process_single ( self , sample = None , rank = None ):
-
model , processor = get_model ( self . model_key , rank = rank )
-
-
messages = [{
-
'role' : 'system' ,
-
'content' : self . system_prompt
-
}, {
-
'role' : 'user' ,
-
'content' : sample [ self . text_key ]
-
}]
-
input_prompt = processor . apply_chat_template (
-
messages , tokenize = False , add_generation_prompt = True )
-
-
if self . enable_vllm :
-
response = model . generate ([ input_prompt ], self . sampling_params )
-
output = response [ 0 ] . outputs [ 0 ] . text
-
else :
-
inputs = processor ( input_prompt ,
-
return_tensors = 'pt' ) . to ( model . device )
-
response = model . generate ( ** inputs ,
-
eos_token_id = processor . eos_token_id ,
-
** self . sampling_params )
-
output = processor . decode ( response . cpu ()[ 0 ],
-
skip_special_tokens = True )
-
-
sample [ self . text_key ] = output
-
-
return sample
-
-
-
-
-
-
-
-
-
-
-
-
-
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
new file mode 100644
index 000000000..a040d8438
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
@@ -0,0 +1,254 @@
+
+
+
+
+
+
+
+
data_juicer.ops.mapper.optimize_qa_mapper — data_juicer 0.2.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ data_juicer
+
+
+
+
+
+
+
+
+
Source code for data_juicer.ops.mapper.optimize_qa_mapper
+import re
+from typing import Dict , Optional
+
+from loguru import logger
+
+from data_juicer.ops.base_op import OPERATORS , UNFORKABLE , Mapper
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.model_utils import get_model , prepare_model
+
+torch = LazyLoader ( 'torch' , 'torch' )
+vllm = LazyLoader ( 'vllm' , 'vllm' )
+
+OP_NAME = 'optimize_qa_mapper'
+
+
+# TODO: Extend LLM-based OPs into API-based implementation.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class OptimizeQAMapper(Mapper):
    """
    Mapper to optimize question-answer pairs.

    For each sample, the query and response are formatted into a prompt,
    sent to a chat model (a Hugging Face pipeline or vLLM), and the
    question/answer parsed from the model output replace the originals.
    """

    # avoid leading whitespace
    DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。'
                             '必须按照以下标记格式,直接输出优化后的问答对:\n'
                             '【问题】\n'
                             '优化后的问题\n'
                             '【回答】\n'
                             '优化后的回答')
    DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
    DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
    DEFAULT_OUTPUT_PATTERN = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)'

    _accelerator = 'cuda'

    def __init__(self,
                 hf_model: str = 'Qwen/Qwen2.5-7B-Instruct',
                 *,
                 system_prompt: Optional[str] = None,
                 input_template: Optional[str] = None,
                 qa_pair_template: Optional[str] = None,
                 output_pattern: Optional[str] = None,
                 enable_vllm: bool = False,
                 model_params: Optional[Dict] = None,
                 sampling_params: Optional[Dict] = None,
                 **kwargs):
        """
        Initialization method.

        :param hf_model: Hugging Face model ID.
        :param system_prompt: System prompt for guiding the optimization task.
        :param input_template: Template for building the input for the model.
            Please make sure the template contains one placeholder '{}', which
            corresponds to the question and answer pair generated by
            param `qa_pair_template`.
        :param qa_pair_template: Template for formatting the question and
            answer pair. Please make sure the template contains two
            '{}' to format question and answer.
        :param output_pattern: Regular expression pattern to extract question
            and answer from model response. It must define two capture
            groups: the question first, then the answer.
        :param enable_vllm: Whether to use VLLM for inference acceleration.
        :param model_params: Parameters for initializing the model.
        :param sampling_params: Sampling parameters for text generation (e.g.,
            {'temperature': 0.9, 'top_p': 0.95}).
        :param kwargs: Extra keyword arguments.
        """
        super().__init__(**kwargs)

        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
        self.qa_pair_template = qa_pair_template or \
            self.DEFAULT_QA_PAIR_TEMPLATE
        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN

        self.enable_vllm = enable_vllm
        # Work on copies: `tensor_parallel_size` may be written below and
        # the caller's dictionaries must not be modified as a side effect.
        model_params = dict(model_params or {})
        sampling_params = dict(sampling_params or {})

        if enable_vllm:
            assert torch.cuda.device_count() >= 1, 'must be executed in CUDA'
            # cannot initialize vllm replicas on different GPUs
            self.num_proc = 1
            if model_params.get('tensor_parallel_size') is None:
                tensor_parallel_size = torch.cuda.device_count()
                # Adjacent literals instead of a backslash continuation
                # inside the string, which would embed the source
                # indentation in the logged message.
                logger.info(f'Set tensor_parallel_size to '
                            f'{tensor_parallel_size} for vllm.')
                model_params['tensor_parallel_size'] = tensor_parallel_size
            self.model_key = prepare_model(
                model_type='vllm',
                pretrained_model_name_or_path=hf_model,
                **model_params)
            self.sampling_params = vllm.SamplingParams(**sampling_params)
        else:
            self.model_key = prepare_model(
                model_type='huggingface',
                pretrained_model_name_or_path=hf_model,
                return_pipe=True,
                **model_params)
            self.sampling_params = sampling_params

    def build_input(self, sample):
        """
        Build the user message sent to the model for one sample.

        The sample's query and response are inserted into
        `qa_pair_template`, and the result is inserted into
        `input_template`.

        :param sample: sample providing `self.query_key` and
            `self.response_key` fields.
        :return: the formatted prompt string.
        """
        qa_pair = self.qa_pair_template.format(sample[self.query_key],
                                               sample[self.response_key])
        return self.input_template.format(qa_pair)

    def parse_output(self, raw_output):
        """
        Extract the optimized question and answer from the model output.

        :param raw_output: raw text generated by the model.
        :return: a `(question, answer)` tuple of stripped strings, or
            `(None, None)` when `output_pattern` does not match.
        """
        logger.debug(raw_output)
        # `re.search` yields a match object for the first occurrence;
        # `re.findall` would return plain tuples without `.group()`.
        match = re.search(self.output_pattern, raw_output, re.DOTALL)
        if match is None:
            return None, None
        return match.group(1).strip(), match.group(2).strip()

    def process_single(self, sample=None, rank=None):
        """
        Optimize the question-answer pair of one sample.

        :param sample: sample whose query/response fields are optimized.
        :param rank: forwarded to `get_model` when fetching the model.
        :return: the same sample; a field is overwritten only when a
            non-empty value was parsed for it.
        """
        model, _ = get_model(self.model_key, rank, self.use_cuda())

        input_prompt = self.build_input(sample)
        messages = [{
            'role': 'system',
            'content': self.system_prompt
        }, {
            'role': 'user',
            'content': input_prompt
        }]

        if self.enable_vllm:
            response = model.chat(messages, self.sampling_params)
            output = response[0].outputs[0].text
        else:
            # model is pipe
            response = model(messages,
                             return_full_text=False,
                             **self.sampling_params)
            output = response[0]['generated_text']

        parsed_q, parsed_a = self.parse_output(output)
        if parsed_q:
            sample[self.query_key] = parsed_q
        if parsed_a:
            sample[self.response_key] = parsed_a

        return sample
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
new file mode 100644
index 000000000..ab704fc5a
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
@@ -0,0 +1,129 @@
+
+
+
+
+
+
+
+
data_juicer.ops.mapper.optimize_query_mapper — data_juicer 0.2.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ data_juicer
+
+
+
+
+
+
+
+
+
Source code for data_juicer.ops.mapper.optimize_query_mapper
+from data_juicer.ops.base_op import OPERATORS , UNFORKABLE
+from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
+
+OP_NAME = 'optimize_query_mapper'
+
+
+# TODO: Extend LLM-based OPs into API-based implementation.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class OptimizeQueryMapper(OptimizeQAMapper):
    """
    Mapper to optimize query in question-answer pairs.

    Only the system prompt and the output parsing differ from the parent
    class: the whole model response is taken as the new query.
    """

    _accelerator = 'cuda'

    DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'  # noqa: E501

    def parse_output(self, raw_output):
        """
        Treat the stripped model output as the optimized query.

        :param raw_output: raw text generated by the model.
        :return: `(query, None)`; the second slot is always None.
        """
        optimized_query = raw_output.strip()
        return optimized_query, None
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
new file mode 100644
index 000000000..22993ce81
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
@@ -0,0 +1,129 @@
+
+
+
+
+
+
+
+
data_juicer.ops.mapper.optimize_response_mapper — data_juicer 0.2.0 documentation
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ data_juicer
+
+
+
+
+
+
+
+
+
Source code for data_juicer.ops.mapper.optimize_response_mapper
+from data_juicer.ops.base_op import OPERATORS , UNFORKABLE
+from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
+
+OP_NAME = 'optimize_response_mapper'
+
+
+# TODO: Extend LLM-based OPs into API-based implementation.
@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
class OptimizeResponseMapper(OptimizeQAMapper):
    """
    Mapper to optimize response in question-answer pairs.

    Only the system prompt and the output parsing differ from the parent
    class: the whole model response is taken as the new answer.
    """

    _accelerator = 'cuda'

    DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'

    def parse_output(self, raw_output):
        """
        Treat the stripped model output as the optimized response.

        :param raw_output: raw text generated by the model.
        :return: `(None, response)`; the first slot is always None.
        """
        optimized_response = raw_output.strip()
        return None, optimized_response
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/_modules/index.html b/_modules/index.html
index a553f9a40..7df789de5 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -159,9 +159,9 @@
All modules for which code is available
data_juicer.ops.mapper.clean_ip_mapper
data_juicer.ops.mapper.clean_links_mapper
data_juicer.ops.mapper.expand_macro_mapper
-
data_juicer.ops.mapper.extract_qa_mapper
data_juicer.ops.mapper.fix_unicode_mapper
-
data_juicer.ops.mapper.generate_instruction_mapper
+
data_juicer.ops.mapper.generate_qa_from_examples_mapper
+
data_juicer.ops.mapper.generate_qa_from_text_mapper
data_juicer.ops.mapper.image_blur_mapper
data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
data_juicer.ops.mapper.image_captioning_mapper
@@ -170,7 +170,9 @@
All modules for which code is available
data_juicer.ops.mapper.image_tagging_mapper
data_juicer.ops.mapper.nlpaug_en_mapper
data_juicer.ops.mapper.nlpcda_zh_mapper
-
data_juicer.ops.mapper.optimize_instruction_mapper
+
data_juicer.ops.mapper.optimize_qa_mapper
+
data_juicer.ops.mapper.optimize_query_mapper
+
data_juicer.ops.mapper.optimize_response_mapper
data_juicer.ops.mapper.punctuation_normalization_mapper
data_juicer.ops.mapper.remove_bibliography_mapper
data_juicer.ops.mapper.remove_comments_mapper
diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html
index b3126bfce..34c7a68be 100644
--- a/data_juicer.ops.deduplicator.html
+++ b/data_juicer.ops.deduplicator.html
@@ -47,15 +47,15 @@
data_juicer.ops.filter
data_juicer.ops.mapper
data_juicer.ops.deduplicator
data_juicer.ops.selector
@@ -92,44 +92,45 @@
data_juicer.ops.deduplicator
-
-class data_juicer.ops.deduplicator. VideoDeduplicator ( consider_text : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.deduplicator. DocumentDeduplicator ( lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
Bases: Deduplicator
-Deduplicator to deduplicate samples at document-level using exact matching
-of videos between documents.
+Deduplicator to deduplicate samples at document-level using exact matching.
+Using md5 hash to deduplicate samples.
-
-__init__ ( consider_text : bool = False , * args , ** kwargs ) [source]
-Initialization.
+
+__init__ ( lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
+Initialization method.
Parameters:
-consider_text – whether to consider text hash together with video
-hash when applying deduplication.
+lowercase – Whether to convert sample text to lower case
+ignore_non_character – Whether to ignore non-alphabet
+characters, including whitespaces, digits, and punctuations
args – extra args
-kwargs – extra args
+kwargs – extra args.
-
-compute_hash ( sample , context = False ) [source]
-Compute hash values for the sample.
+
+compute_hash ( sample ) [source]
+Compute md5 hash values for the sample.
Parameters:
sample – input sample
Returns:
-sample with computed hash value.
+sample with md5 hash value.
-
-process ( dataset , show_num = 0 ) [source]
+
+process ( dataset , show_num = 0 ) [source]
For doc-level, dataset –> dataset.
Parameters:
@@ -147,69 +148,6 @@
-
-
-class data_juicer.ops.deduplicator. RayBasicDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , * args , ** kwargs ) [source]
-Bases: Filter
-A basic exact matching deduplicator for RAY.
-Although its functionality is deduplication,
-it is implemented as Filter sub-class.
-
-
-EMPTY_HASH_VALUE = 'EMPTY'
-
-
-
-
-__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , * args , ** kwargs ) [source]
-Initialization.
-:param redis_host: the hostname of redis server
-:param redis_port: the port of redis server
-:param args: extra args
-:param kwargs: extra args
-
-
-
-
-calculate_hash ( sample , context = False ) [source]
-Calculate hash value for the sample.
-
-
-
-
-compute_stats_single ( sample , context = False ) [source]
-Compute stats for the sample which is used as a metric to decide
-whether to filter this sample.
-
-Parameters:
-
-
-Returns:
-sample with computed stats
-
-
-
-
-
-
-process_single ( sample ) [source]
-For sample level, sample –> Boolean.
-
-Parameters:
-sample – sample to decide whether to filter
-
-Returns:
-true for keeping and false for filtering
-
-
-
-
-
-
class data_juicer.ops.deduplicator. DocumentMinhashDeduplicator ( tokenization : str = 'space' , window_size : int [ int ] = 5 , lowercase : bool = True , ignore_pattern : str | None = None , num_permutations : int [ int ] = 256 , jaccard_threshold : float [ float ] = 0.7 , num_bands : int [ int ] | None = None , num_rows_per_band : int [ int ] | None = None , tokenizer_model : str | None = None , * args , ** kwargs ) [source]
@@ -291,95 +229,54 @@
-
-class data_juicer.ops.deduplicator. RayImageDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , method : str = 'phash' , * args , ** kwargs ) [source]
-Bases: RayBasicDeduplicator
-Deduplicator to deduplicate samples at document-level using exact matching
-of images between documents.
-
-
-__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , method : str = 'phash' , * args , ** kwargs ) [source]
-Initialization.
-:param redis_host: the hostname of redis server
-:param redis_port: the port of redis server
-:param args: extra args
-:param kwargs: extra args
-
-
-
-
-calculate_hash ( sample , context = False ) [source]
-Calculate hash value for the sample.
-
-
-
-
-
-
-class data_juicer.ops.deduplicator. RayDocumentDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
-Bases: RayBasicDeduplicator
-Deduplicator to deduplicate samples at document-level using exact matching.
-
-
-__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
-Initialization method.
-:param redis_host: the hostname of redis server
-:param redis_port: the port of redis server
-:param lowercase: Whether to convert sample text to lower case
-:param ignore_non_character: Whether to ignore non-alphabet
-characters, including whitespaces, digits, and punctuations
-:param args: extra args
-:param kwargs: extra args.
-
-
-
-
-calculate_hash ( sample , context = False ) [source]
-Calculate hash value for the sample.
-
-
-
-
-
-
-class data_juicer.ops.deduplicator. DocumentDeduplicator ( lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.deduplicator. DocumentSimhashDeduplicator ( tokenization : str = 'space' , window_size : int [ int ] = 6 , lowercase : bool = True , ignore_pattern : str | None = None , num_blocks : int [ int ] = 6 , hamming_distance : int [ int ] = 4 , * args , ** kwargs ) [source]
Bases: Deduplicator
-Deduplicator to deduplicate samples at document-level using exact matching.
-Using md5 hash to deduplicate samples.
+Deduplicator to deduplicate samples at document-level using SimHash.
-
-__init__ ( lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
-Initialization method.
+
+__init__ ( tokenization : str = 'space' , window_size : int [ int ] = 6 , lowercase : bool = True , ignore_pattern : str | None = None , num_blocks : int [ int ] = 6 , hamming_distance : int [ int ] = 4 , * args , ** kwargs ) [source]
+Initialization method.
+tokenization – tokenization method for sample texts.
+It should be one of [space, punctuation, character]. For
+English-like languages, we recommend using ‘space’, and for
+Chinese-like languages, we recommend using ‘character’.
Parameters:
-lowercase – Whether to convert sample text to lower case
-ignore_non_character – Whether to ignore non-alphabet
-characters, including whitespaces, digits, and punctuations
-args – extra args
-kwargs – extra args.
+window_size – window size of shingling
+lowercase – whether to convert text to lower case first
+ignore_pattern – whether to ignore sub-strings with
+specific pattern when computing simhash
+num_blocks – number of blocks in simhash computing
+hamming_distance – the max hamming distance threshold in
+near-duplicate detection. When the hamming distance of two
+sample texts is <= this threshold, they are regarded as
+similar samples and this op will only keep one of them after
+deduplication. This threshold should be always less than
+num_blocks
-
-compute_hash ( sample ) [source]
-Compute md5 hash values for the sample.
+
+compute_hash ( sample ) [source]
+Compute simhash values for the sample.
Parameters:
sample – input sample
Returns:
-sample with md5 hash value.
+sample with simhash value.
-
-process ( dataset , show_num = 0 ) [source]
+
+process ( dataset , show_num = 0 ) [source]
For doc-level, dataset –> dataset.
Parameters:
@@ -455,71 +352,118 @@
-
-class data_juicer.ops.deduplicator. DocumentSimhashDeduplicator ( tokenization : str = 'space' , window_size : int [ int ] = 6 , lowercase : bool = True , ignore_pattern : str | None = None , num_blocks : int [ int ] = 6 , hamming_distance : int [ int ] = 4 , * args , ** kwargs ) [source]
-Bases: Deduplicator
-Deduplicator to deduplicate samples at document-level using SimHash.
+
+class data_juicer.ops.deduplicator. RayBasicDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , * args , ** kwargs ) [source]
+Bases: Filter
+A basic exact matching deduplicator for RAY.
+Although its functionality is deduplication,
+it is implemented as Filter sub-class.
+
+
+EMPTY_HASH_VALUE = 'EMPTY'
+
+
-
-__init__ ( tokenization : str = 'space' , window_size : int [ int ] = 6 , lowercase : bool = True , ignore_pattern : str | None = None , num_blocks : int [ int ] = 6 , hamming_distance : int [ int ] = 4 , * args , ** kwargs ) [source]
-Initialization method :param tokenization: tokenization method for
-sample texts.
-It should be one of [space, punctuation, character]. For
-English-like languages, we recommend to use ‘space’. And for
-Chinese-like languages, we recommend to use ‘character’
-
-Parameters:
-
-window_size – window size of shingling
-lowercase – whether to convert text to lower case first
-ignore_pattern – whether to ignore sub-strings with
-specific pattern when computing simhash
-num_blocks – number of blocks in simhash computing
-hamming_distance – the max hamming distance threshold in
-near-duplicate detection. When the hamming distance of two
-sample texts is <= this threshold, they are regarded as
-similar samples and this op will only keep one of them after
-deduplication. This threshold should be always less than
-num_blocks
-
-
-
+
+__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , * args , ** kwargs ) [source]
+Initialization.
+:param redis_host: the hostname of redis server
+:param redis_port: the port of redis server
+:param args: extra args
+:param kwargs: extra args
-
-compute_hash ( sample ) [source]
-Compute simhash values for the sample.
+
+calculate_hash ( sample , context = False ) [source]
+Calculate hash value for the sample.
+
+
+
+
+compute_stats_single ( sample , context = False ) [source]
+Compute stats for the sample which is used as a metric to decide
+whether to filter this sample.
Parameters:
-sample – input sample
+
Returns:
-sample with simhash value.
+sample with computed stats
-
-process ( dataset , show_num = 0 ) [source]
-For doc-level, dataset –> dataset.
+
+process_single ( sample ) [source]
+For sample level, sample –> Boolean.
Parameters:
-
+sample – sample to decide whether to filter
Returns:
-deduplicated dataset and the sampled duplicate pairs.
+true for keeping and false for filtering
+
+
+class data_juicer.ops.deduplicator. RayDocumentDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
+Bases: RayBasicDeduplicator
+Deduplicator to deduplicate samples at document-level using exact matching.
+
+
+__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , lowercase : bool = False , ignore_non_character : bool = False , * args , ** kwargs ) [source]
+Initialization method.
+:param redis_host: the hostname of redis server
+:param redis_port: the port of redis server
+:param lowercase: Whether to convert sample text to lower case
+:param ignore_non_character: Whether to ignore non-alphabet
+characters, including whitespaces, digits, and punctuations
+:param args: extra args
+:param kwargs: extra args.
+
+
+
+
+calculate_hash ( sample , context = False ) [source]
+Calculate hash value for the sample.
+
+
+
+
+
+
+class data_juicer.ops.deduplicator. RayImageDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , method : str = 'phash' , * args , ** kwargs ) [source]
+Bases: RayBasicDeduplicator
+Deduplicator to deduplicate samples at document-level using exact matching
+of images between documents.
+
+
+__init__ ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , method : str = 'phash' , * args , ** kwargs ) [source]
+Initialization.
+:param redis_host: the hostname of redis server
+:param redis_port: the port of redis server
+:param args: extra args
+:param kwargs: extra args
+
+
+
+
+calculate_hash ( sample , context = False ) [source]
+Calculate hash value for the sample.
+
+
+
+
class data_juicer.ops.deduplicator. RayVideoDeduplicator ( redis_host : str = 'localhost' , redis_port : int [ int ] = 6380 , * args , ** kwargs ) [source]
@@ -544,6 +488,62 @@
+
+
+class data_juicer.ops.deduplicator. VideoDeduplicator ( consider_text : bool = False , * args , ** kwargs ) [source]
+Bases: Deduplicator
+Deduplicator to deduplicate samples at document-level using exact matching
+of videos between documents.
+
+
+__init__ ( consider_text : bool = False , * args , ** kwargs ) [source]
+Initialization.
+
+Parameters:
+
+
+
+
+
+
+
+compute_hash ( sample , context = False ) [source]
+Compute hash values for the sample.
+
+Parameters:
+sample – input sample
+
+Returns:
+sample with computed hash value.
+
+
+
+
+
+
+process ( dataset , show_num = 0 ) [source]
+For doc-level, dataset –> dataset.
+
+Parameters:
+
+
+Returns:
+deduplicated dataset and the sampled duplicate pairs.
+
+
+
+
+
+
diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html
index 81afd97db..b9c1f1dd0 100644
--- a/data_juicer.ops.filter.html
+++ b/data_juicer.ops.filter.html
@@ -45,49 +45,49 @@
data_juicer.core
data_juicer.ops
data_juicer.ops.filter
data_juicer.ops.mapper
@@ -126,33 +126,28 @@
data_juicer.ops.filter
-
-class data_juicer.ops.filter. ImageTextSimilarityFilter ( hf_clip : str = 'openai/clip-vit-base-patch32' , trust_remote_code : bool = False , min_score : float = 0.1 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. AlphanumericFilter ( tokenization : bool = False , min_ratio : float = 0.25 , max_ratio : float = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples those similarities between image and text
-within a specific range.
+Filter to keep samples with alphabet/numeric ratio within a specific
+range.
-
-__init__ ( hf_clip : str = 'openai/clip-vit-base-patch32' , trust_remote_code : bool = False , min_score : float = 0.1 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+__init__ ( tokenization : bool = False , min_ratio : float = 0.25 , max_ratio : float = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_clip – clip model name on huggingface to compute
-the similarity between image and text.
-min_score – The min similarity to keep samples.
-max_score – The max similarity to keep samples.
-horizontal_flip – Flip image horizontally (left to right).
-vertical_flip – Flip image vertically (top to bottom).
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
-condition.
-reduce_mode – reduce mode when one text corresponds to
-multiple images in a chunk.
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
+tokenization – Whether to count the ratio of alphanumeric
+to the total number of tokens. if tokenization=False, it
+will count the ratio of alphanumeric to the total number of
+characters.
+min_ratio – The min filter ratio in alphanumeric op,
+samples will be filtered if their alphabet/numeric ratio is
+below this parameter.
+max_ratio – The max filter ratio in alphanumeric op,
+samples will be filtered if their alphabet/numeric ratio
+exceeds this parameter.
args – extra args
kwargs – extra args
@@ -161,60 +156,36 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
-Compute stats for the sample which is used as a metric to decide
-whether to filter this sample.
-
-Parameters:
-
-
-Returns:
-sample with computed stats
-
-
-
+
+compute_stats_batched ( samples ) [source]
+
-
-process_single ( sample , rank = None ) [source]
-For sample level, sample –> Boolean.
-
-Parameters:
-sample – sample to decide whether to filter
-
-Returns:
-true for keeping and false for filtering
-
-
-
+
+process_batched ( samples ) [source]
+
-
-class data_juicer.ops.filter. VideoAspectRatioFilter ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. AudioDurationFilter ( min_duration : int = 0 , max_duration : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with video aspect ratio within a specific range.
-AspectRatio = W / H.
+Keep data samples whose audios’ durations are within a specified range.
-
-__init__ ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_duration : int = 0 , max_duration : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_ratio – The minimum aspect ratio to keep samples,
-supported format is a string, such as “9:21” or “9/21”.
-max_ratio – The maximum aspect ratio to keep samples,
-supported format is a string, such as “21:9” or “21/9”.
+min_duration – The min audio duration to keep samples in seconds.
+It’s 0 by default.
+max_duration – The max audio duration to keep samples in seconds.
+It’s sys.maxsize by default.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
+all audios. ‘any’: keep this sample if any audios meet the
+condition. ‘all’: keep this sample only if all audios meet the
condition.
args – extra args
kwargs – extra args
@@ -224,8 +195,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -243,8 +214,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -259,33 +230,28 @@
-
-class data_juicer.ops.filter. ImageTextMatchingFilter ( hf_blip : str = 'Salesforce/blip-itm-base-coco' , trust_remote_code : bool = False , min_score : float = 0.003 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. AudioNMFSNRFilter ( min_snr : float = 0 , max_snr : float = 9223372036854775807 , nmf_iter_num : int [ int ] = 500 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples those matching score between image and text
-within a specific range.
+Keep data samples whose audios’ SNRs (computed based on NMF) are within
+a specified range.
-
-__init__ ( hf_blip : str = 'Salesforce/blip-itm-base-coco' , trust_remote_code : bool = False , min_score : float = 0.003 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+__init__ ( min_snr : float = 0 , max_snr : float = 9223372036854775807 , nmf_iter_num : int [ int ] = 500 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_blip – blip model name on huggingface to compute
-the matching score between image and text.
-min_score – The min matching score to keep samples.
-max_score – The max matching score to keep samples.
-horizontal_flip – Flip image horizontally (left to right).
-vertical_flip – Flip image vertically (top to bottom).
+min_snr – The min audio SNR to keep samples in dB. It’s 0 by
+default.
+max_snr – The max audio SNR to keep samples in dB. It’s
+sys.maxsize by default.
+nmf_iter_num – The max number of iterations to run NMF. It’s 500
+by default.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+all audios. ‘any’: keep this sample if any audios meet the
+condition. ‘all’: keep this sample only if all audios meet the
condition.
-reduce_mode – reduce mode when one text corresponds to
-multiple images in a chunk.
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
args – extra args
kwargs – extra args
@@ -294,8 +260,8 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -313,8 +279,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -329,24 +295,25 @@
-
-class data_juicer.ops.filter. ImageNSFWFilter ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. AudioSizeFilter ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples whose images have low nsfw scores.
+Keep data samples whose audio size (in bytes/kb/MB/…) is within a
+specific range.
-
-__init__ ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_nsfw_model – nsfw detection model name on huggingface.
-score_threshold – the nsfw score threshold for samples.
-range from 0 to 1. Samples with nsfw score less than this threshold
-will be kept.
+min_size – The min audio size to keep samples. Set to “0” by
+default, meaning no size constraint.
+max_size – The max audio size to keep samples. Set to
+“1TB” by default, an approximation of the unlimited case.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+all audios. ‘any’: keep this sample if any audios meet the
+condition. ‘all’: keep this sample only if all audios meet the
condition.
args – extra args
kwargs – extra args
@@ -356,8 +323,8 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -375,8 +342,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -391,24 +358,23 @@
-
-class data_juicer.ops.filter. TokenNumFilter ( hf_tokenizer : str = 'EleutherAI/pythia-6.9b-deduped' , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. AverageLineLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with total token number within a specific
+
Filter to keep samples with average line length within a specific
range.
-
-__init__ ( hf_tokenizer : str = 'EleutherAI/pythia-6.9b-deduped' , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_tokenizer – the tokenizer name of Hugging Face tokenizers.
-min_num – The min filter token number in this op, samples
-will be filtered if their token number is below this
+
min_len – The min filter length in this op, samples will
+be filtered if their average line length is below this
parameter.
-max_num – The max filter token number in this op, samples
-will be filtered if their token number exceeds this
+
max_len – The max filter length in this op, samples will
+be filtered if their average line length exceeds this
parameter.
args – extra args
kwargs – extra args
@@ -418,59 +384,37 @@
-
-compute_stats_single ( sample ) [source]
-Compute stats for the sample which is used as a metric to decide
-whether to filter this sample.
-
-Parameters:
-
-
-Returns:
-sample with computed stats
-
-
-
+
+compute_stats_batched ( samples , context = False ) [source]
+
-
-process_single ( sample ) [source]
-For sample level, sample –> Boolean.
-
-Parameters:
-sample – sample to decide whether to filter
-
-Returns:
-true for keeping and false for filtering
-
-
-
+
+process_batched ( samples ) [source]
+
-
-class data_juicer.ops.filter. TextLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. CharacterRepetitionFilter ( rep_len : int [ int ] = 10 , min_ratio : float = 0.0 , max_ratio : float = 0.5 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with total text length within a specific
-range.
+Filter to keep samples with char-level n-gram repetition ratio within a
+specific range.
-
-__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( rep_len : int [ int ] = 10 , min_ratio : float = 0.0 , max_ratio : float = 0.5 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_len – The min text length in the filtering. samples
-will be filtered if their text length is below this
-parameter.
-max_len – The max text length in the filtering. samples
-will be filtered if their text length exceeds this
-parameter.
+rep_len – Repetition length for char-level n-gram.
+min_ratio – The min filter ratio in this op, samples will
+be filtered if their char-level n-gram repetition ratio is
+below this parameter.
+max_ratio – The max filter ratio in this op, samples will
+be filtered if their char-level n-gram repetition ratio
+exceeds this parameter.
args – extra args
kwargs – extra args
@@ -479,41 +423,43 @@
-
-compute_stats_batched ( samples ) [source]
+
+compute_stats_batched ( samples ) [source]
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.filter. SpecifiedNumericFieldFilter ( field_key : str = '' , min_value : float = -9223372036854775807 , max_value : float = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. FlaggedWordFilter ( lang : str = 'en' , tokenization : bool = False , max_ratio : float = 0.045 , flagged_words_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
Bases: Filter
-Filter based on specified numeric field information.
-If the specified numeric information in the sample is not within the
-specified range, the sample will be filtered.
+Filter to keep samples with flagged-word ratio less than a specific max
+value.
-
-__init__ ( field_key : str = '' , min_value : float = -9223372036854775807 , max_value : float = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( lang : str = 'en' , tokenization : bool = False , max_ratio : float = 0.045 , flagged_words_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-field_key – Filter based on the specified numeric value
-corresponding to the target key. The target key
-corresponding to multi-level field information need to be
-separated by ‘.’.
-min_value – The min filter value in SpecifiedNumericField
-op, samples will be filtered if their specified numeric
-field value is below this parameter.
-max_value – The max filter value in SpecifiedNumericField
-op, samples will be filtered if their specified numeric
-field value exceeds this parameter.
+lang – Consider flagged words in what language. If lang ==
+“all”, we will adopt the one merged from all the available
+languages
+tokenization – Whether to use model to tokenize documents
+max_ratio – The max filter ratio in this op.
+flagged_words_dir – The directory storing the
+flagged_words file(s) whose name includes “flagged_words”
+and in json format
+use_words_aug – Whether to augment words, especially for
+Chinese and Vietnamese
+words_aug_group_sizes – The group size of words to augment
+words_aug_join_char – The join char between words to
+augment
args – extra args
kwargs – extra args
@@ -522,8 +468,8 @@
-
-compute_stats_single ( sample ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -541,8 +487,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -557,38 +503,37 @@
-
-class data_juicer.ops.filter. AudioNMFSNRFilter ( min_snr : float = 0 , max_snr : float = 9223372036854775807 , nmf_iter_num : int [ int ] = 500 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageAestheticsFilter ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.5 , max_score : float = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose audios’ SNRs (computed based on NMF) are within
-a specified range.
+Filter to keep samples with aesthetics scores within a specific range.
-
-__init__ ( min_snr : float = 0 , max_snr : float = 9223372036854775807 , nmf_iter_num : int [ int ] = 500 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.5 , max_score : float = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_snr – The min audio SNR to keep samples in dB. It’s 0 by
-default.
-max_snr – The max audio SNR to keep samples in dB. It’s
-sys.maxsize by default.
-nmf_iter_num – The max number of iterations to run NMF. It’s 500
-in default.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all audios. ‘any’: keep this sample if any audios meet the
-condition. ‘all’: keep this sample only if all audios meet the
+
hf_scorer_model – Huggingface model name for the aesthetics
+predictor. By default, we will use
+‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’,
+refer to pypi.org/project/simple-aesthetics-predictor
+min_score – Min score for the predicted aesthetics in an image.
+max_score – Max score for the predicted aesthetics in an image.
+any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
condition.
-args – extra args
-kwargs – extra args
+args – Extra positional arguments.
+kwargs – Extra keyword arguments.
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -606,8 +551,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -622,57 +567,34 @@
-
-class data_juicer.ops.filter. VideoAestheticsFilter ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.4 , max_score : float = 1.0 , frame_sampling_method : str = 'uniform' , frame_num : int [ int ] = 3 , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageAspectRatioFilter ( min_ratio : float = 0.333 , max_ratio : float = 3.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep data samples with aesthetics scores for specified frames
-in the videos within a specific range.
+Filter to keep samples with image aspect ratio within a specific range.
+AspectRatio = W / H.
-
-__init__ ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.4 , max_score : float = 1.0 , frame_sampling_method : str = 'uniform' , frame_num : int [ int ] = 3 , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+__init__ ( min_ratio : float = 0.333 , max_ratio : float = 3.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_scorer_model – Huggingface model name for the aesthetics
-predictor. By default, we will use
-‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’,
-refer to pypi.org/project/simple-aesthetics-predictor
-min_score – Min score for the predicted aesthetics in a video.
-max_score – Max score for the predicted aesthetics in a video.
-frame_sampling_method – sampling method of extracting frame
-images from the videos.
-Should be one of [“all_keyframes”, “uniform”].
-The former one extracts all key frames and the latter one extract
-specified number of frames uniformly from the video.
-Default: “uniform” with frame_num=3, considering that the number of
-keyframes can be large while their difference is usually small
-in terms of their aesthetics.
-frame_num – the number of frames to be extracted uniformly from
-the video. Only works when frame_sampling_method is “uniform”. If
-it’s 1, only the middle frame will be extracted. If it’s 2, only
-the first and the last frames will be extracted. If it’s larger
-than 2, in addition to the first and the last frames, other frames
-will be extracted uniformly within the video duration.
-any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
+
min_ratio – The min aspect ratio to keep samples.
+max_ratio – The max aspect ratio to keep samples.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
condition.
-reduce_mode – reduce mode when one sample corresponds to
-multiple frames, must be one of [‘avg’,’max’, ‘min’].
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
-args – Extra positional arguments.
-kwargs – Extra keyword arguments.
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -690,8 +612,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -706,89 +628,97 @@
-
-class data_juicer.ops.filter. PerplexityFilter ( lang : str = 'en' , max_ppl : float = 1500 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageFaceCountFilter ( cv_classifier : str = '' , min_face_count : int = 1 , max_face_count : int = 1 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with perplexity score less than a specific max
-value.
+Filter to keep samples with the number of faces within a specific range.
-
-__init__ ( lang : str = 'en' , max_ppl : float = 1500 , * args , ** kwargs ) [source]
+
+__init__ ( cv_classifier : str = '' , min_face_count : int = 1 , max_face_count : int = 1 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – Compute perplexity for samples in which language.
-max_ppl – The max filter perplexity in this op, samples
-will be filtered if their perplexity exceeds this parameter.
-args – extra args
-kwargs – extra args
+cv_classifier – OpenCV classifier path for face detection.
+By default, we will use ‘haarcascade_frontalface_alt.xml’.
+min_face_count – Minimum number of faces required for samples.
+max_face_count – Maximum number of faces allowed for samples.
+any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
+args – Extra positional arguments.
+kwargs – Extra keyword arguments.
-
-compute_stats_batched ( samples , context = False ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
-
-
-
-
-
-class data_juicer.ops.filter. PhraseGroundingRecallFilter ( hf_owlvit : str = 'google/owlvit-base-patch32' , trust_remote_code : bool = False , min_recall : float = 0.1 , max_recall : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , iou_thr : float = 0.5 , large_area_ratio_thr : float = 0.95 , conf_thr : float = 0.0 , * args , ** kwargs ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
+Compute stats for the sample which is used as a metric to decide
+whether to filter this sample.
+
+Parameters:
+
+
+Returns:
+sample with computed stats
+
+
+
+
+
+
+process_single ( sample ) [source]
+For sample level, sample –> Boolean.
+
+Parameters:
+sample – sample to decide whether to filter
+
+Returns:
+true for keeping and false for filtering
+
+
+
+
+
+
+
+
+class data_juicer.ops.filter. ImageFaceRatioFilter ( cv_classifier : str = '' , min_ratio : float = 0.0 , max_ratio : float = 0.4 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples whose locating recalls of phrases extracted
-from text in the images are within a specified range.
+Filter to keep samples with face area ratios within a specific range.
-
-__init__ ( hf_owlvit : str = 'google/owlvit-base-patch32' , trust_remote_code : bool = False , min_recall : float = 0.1 , max_recall : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , iou_thr : float = 0.5 , large_area_ratio_thr : float = 0.95 , conf_thr : float = 0.0 , * args , ** kwargs ) [source]
+
+__init__ ( cv_classifier : str = '' , min_ratio : float = 0.0 , max_ratio : float = 0.4 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_owlvit – Owl-ViT model name on huggingface to locate the
-phrases extracted from the text.
-min_recall – The min phrase grounding recall to keep samples.
-max_recall – The max phrase grounding recall to keep samples.
-horizontal_flip – Flip image horizontally (left to right).
-vertical_flip – Flip image vertically (top to bottom).
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+
cv_classifier – OpenCV classifier path for face detection.
+By default, we will use ‘haarcascade_frontalface_alt.xml’.
+min_ratio – Min ratio for the largest face area in an image.
+max_ratio – Max ratio for the largest face area in an image.
+any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
all images. ‘any’: keep this sample if any images meet the
condition. ‘all’: keep this sample only if all images meet the
condition.
-reduce_mode – reduce mode when one text corresponds to
-multiple images in a chunk.
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
-iou_thr – the IoU threshold for NMS-like post-process. If two
-predicted bboxes are overlap with an IoU larger than this
-threshold, the bbox with less confidence will be removed. Default:
-0.5.
-large_area_ratio_thr – the area ratio threshold for filtering out
-those large predicted bboxes. If the area of a predicted bbox
-accounts for more than this ratio threshold of the whole image
-area, this bbox will be removed. Default: 0.95.
-conf_thr – the confidence score threshold for removing
-low-confidence bboxes. If the confidence score of a predicted bbox
-is lower than the threshold, this bbox will be removed. Default: 0.
-args – extra args
-kwargs – extra args
+args – Extra positional arguments.
+kwargs – Extra keyword arguments.
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -806,8 +736,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -822,24 +752,25 @@
-
-class data_juicer.ops.filter. MaximumLineLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageNSFWFilter ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with maximum line length within a specific
-range.
+Filter to keep samples whose images have low nsfw scores.
-
-__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_len – The min filter length in this op, samples will
-be filtered if their maximum line length is below this
-parameter.
-max_len – The max filter length in this op, samples will
-be filtered if their maximum line length exceeds this
-parameter.
+hf_nsfw_model – nsfw detection model name on huggingface.
+score_threshold – the nsfw score threshold for samples,
+ranging from 0 to 1. Samples with nsfw score less than this threshold
+will be kept.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
args – extra args
kwargs – extra args
@@ -848,75 +779,133 @@
-
-compute_stats_batched ( samples , context = False ) [source]
-
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
+Compute stats for the sample which is used as a metric to decide
+whether to filter this sample.
+
+Parameters:
+
+
+Returns:
+sample with computed stats
+
+
+
-
-process_batched ( samples ) [source]
-
+
+process_single ( sample , rank = None ) [source]
+For sample level, sample –> Boolean.
+
+Parameters:
+sample – sample to decide whether to filter
+
+Returns:
+true for keeping and false for filtering
+
+
+
-
-class data_juicer.ops.filter. AverageLineLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImagePairSimilarityFilter ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : ClosedUnitInterval = 0.1 , max_score : ClosedUnitInterval = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with average line length within a specific
-range.
+Filter to keep image pairs with similarities between images
+within a specific range.
-
-__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : ClosedUnitInterval = 0.1 , max_score : ClosedUnitInterval = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
+
+
+param hf_clip:
+clip model name on huggingface to compute
+the similarity between images.
+
+param min_score:
+The min similarity to keep samples.
+
+param max_score:
+The max similarity to keep samples.
+
+param any_or_all:
+keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
+
+param args:
+extra args
+
+param kwargs:
+extra args
+
+
+
+
+
+
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
+Compute stats for the sample which is used as a metric to decide
+whether to filter this sample.
Parameters:
-min_len – The min filter length in this op, samples will
-be filtered if their average line length is below this
-parameter.
-max_len – The max filter length in this op, samples will
-be filtered if their average line length exceeds this
-parameter.
-args – extra args
-kwargs – extra args
+sample – input sample.
+context – whether to store context information of intermediate
+vars in the sample temporarily.
+Returns:
+sample with computed stats
+
-
-compute_stats_batched ( samples , context = False ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
+
+process_single ( sample , rank = None ) [source]
+For sample level, sample –> Boolean.
+
+Parameters:
+sample – sample to decide whether to filter
+
+Returns:
+true for keeping and false for filtering
+
+
+
-
-class data_juicer.ops.filter. SpecifiedFieldFilter ( field_key : str = '' , target_value : List = [] , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageShapeFilter ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter based on specified field information.
-If the specified field information in the sample is not within the
-specified target value, the sample will be filtered.
+Filter to keep samples with image shape (w, h) within specific ranges.
-
-__init__ ( field_key : str = '' , target_value : List = [] , * args , ** kwargs ) [source]
+
+__init__ ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-field_key – Filter based on the specified value
-corresponding to the target key. The target key
-corresponding to multi-level field information need to be
-separated by ‘.’.
-target_value – The range of specified field information
-corresponding to the samples that need to be retained.
+min_width – The min width to keep samples.
+max_width – The max width to keep samples.
+min_height – The min height to keep samples.
+max_height – The max height to keep samples.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
args – extra args
kwargs – extra args
@@ -925,8 +914,8 @@
-
-compute_stats_single ( sample ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -944,8 +933,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -960,40 +949,25 @@
-
-class data_juicer.ops.filter. VideoTaggingFromFramesFilter ( tags : List [ str ] = ['people'] , contain : str = 'any' , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageSizeFilter ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples whose videos contain the given tags.
+Keep data samples whose image size (in Bytes/KB/MB/…) is within a
+specific range.
-
-__init__ ( tags : List [ str ] = ['people'] , contain : str = 'any' , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-tags – a tag list to shift the videos, total tags can be found
-in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt # noqa: E501
-contain – require the videos containing ‘any’ or ‘all’ tags.
-When tags equal to [], ‘all’ keeps all samples, ‘any’ keeps no
-sample.
-frame_sampling_method – sampling method of extracting frame
-images from the videos. Should be one of
-[“all_keyframes”, “uniform”].
-The former one extracts all key frames (the number of which depends
-on the duration of the video) and the latter one extract specified
-number of frames uniformly from the video.
-Default: “all_keyframes”.
-frame_num – the number of frames to be extracted uniformly from
-the video. Only works when frame_sampling_method is “uniform”. If
-it’s 1, only the middle frame will be extracted. If it’s 2, only
-the first and the last frames will be extracted. If it’s larger
-than 2, in addition to the first and the last frames, other frames
-will be extracted uniformly within the video duration.
-tag_field_name – the field name to store the tags. It’s
-“__dj__video_frame_tags__” in default.
+min_size – The min image size to keep samples. Set to “0” by
+default for no size constraint.
+max_size – The max image size to keep samples. Set to
+“1TB” by default, an approximation of the unlimited case.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
condition.
args – extra args
kwargs – extra args
@@ -1003,8 +977,8 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1022,8 +996,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1038,34 +1012,43 @@
-
-class data_juicer.ops.filter. TextEntityDependencyFilter ( lang : str = 'en' , min_dependency_num : int = 1 , any_or_all : str = 'all' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageTextMatchingFilter ( hf_blip : str = 'Salesforce/blip-itm-base-coco' , trust_remote_code : bool = False , min_score : float = 0.003 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Bases: Filter
-Identify the entities in the text which are independent with other token,
-and filter them. The text containing no entities will be omitted.
+Filter to keep samples whose matching scores between image and text
+are within a specific range.
-
-__init__ ( lang : str = 'en' , min_dependency_num : int = 1 , any_or_all : str = 'all' , * args , ** kwargs ) [source]
+
+__init__ ( hf_blip : str = 'Salesforce/blip-itm-base-coco' , trust_remote_code : bool = False , min_score : float = 0.003 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – language of the text in the samples. ‘en’ for detection of
-entities in English and ‘zh’ for detection of entities in Chinese.
-mini_dependency_num – The min token number in the filtering.
-Objects is independent if their number of edges in the dependency
-tree is below this parameter.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy.
-‘any’: keep this sample if any objet is dependent. ‘all’: keep this
-sample only if all images are dependent.
+hf_blip – blip model name on huggingface to compute
+the matching score between image and text.
+min_score – The min matching score to keep samples.
+max_score – The max matching score to keep samples.
+horizontal_flip – Flip image horizontally (left to right).
+vertical_flip – Flip image vertically (top to bottom).
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
+reduce_mode – reduce mode when one text corresponds to
+multiple images in a chunk.
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1083,8 +1066,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1099,25 +1082,33 @@
-
-class data_juicer.ops.filter. VideoResolutionFilter ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. ImageTextSimilarityFilter ( hf_clip : str = 'openai/clip-vit-base-patch32' , trust_remote_code : bool = False , min_score : float = 0.1 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose videos’ resolutions are within a specified range.
+Filter to keep samples whose similarities between image and text
+are within a specific range.
-
-__init__ ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( hf_clip : str = 'openai/clip-vit-base-patch32' , trust_remote_code : bool = False , min_score : float = 0.1 , max_score : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_width – The min horizontal resolution.
-max_width – The max horizontal resolution.
-min_height – The min vertical resolution.
-max_height – The max vertical resolution.
+hf_clip – clip model name on huggingface to compute
+the similarity between image and text.
+min_score – The min similarity to keep samples.
+max_score – The max similarity to keep samples.
+horizontal_flip – Flip image horizontally (left to right).
+vertical_flip – Flip image vertically (top to bottom).
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
condition.
+reduce_mode – reduce mode when one text corresponds to
+multiple images in a chunk.
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
args – extra args
kwargs – extra args
@@ -1126,8 +1117,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1145,8 +1136,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1160,48 +1151,6 @@
-
-
-class data_juicer.ops.filter. AlphanumericFilter ( tokenization : bool = False , min_ratio : float = 0.25 , max_ratio : float = 9223372036854775807 , * args , ** kwargs ) [source]
-Bases: Filter
-Filter to keep samples with alphabet/numeric ratio within a specific
-range.
-
-
-__init__ ( tokenization : bool = False , min_ratio : float = 0.25 , max_ratio : float = 9223372036854775807 , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-tokenization – Whether to count the ratio of alphanumeric
-to the total number of tokens. if tokenization=False, it
-will count the ratio of alphanumeric to the total number of
-characters.
-min_ratio – The min filter ratio in alphanumeric op,
-samples will be filtered if their alphabet/numeric ratio is
-below this parameter.
-max_ratio – The max filter ratio in alphanumeric op,
-samples will be filtered if their alphabet/numeric ratio
-exceeds this parameter.
-args – extra args
-kwargs – extra args
-
-
-
-
-
-
-
-compute_stats_batched ( samples ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
-
-
-
class data_juicer.ops.filter. ImageWatermarkFilter ( hf_watermark_model : str = 'amrul-hzz/watermark_detector' , trust_remote_code : bool = False , prob_threshold : float = 0.8 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
@@ -1267,37 +1216,31 @@
-
-class data_juicer.ops.filter. ImageAestheticsFilter ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.5 , max_score : float = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. LanguageIDScoreFilter ( lang : str | List [ str ] = '' , min_score : float = 0.8 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with aesthetics scores within a specific range.
+Filter to keep samples in a specific language with confidence score
+larger than a specific min value.
-
-__init__ ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.5 , max_score : float = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( lang : str | List [ str ] = '' , min_score : float = 0.8 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_scorer_model – Huggingface model name for the aesthetics
-predictor. By default, we will use
-‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’,
-refer to pypi.org/project/simple-aesthetics-predictor
-min_score – Min score for the predicted aesthetics in an image.
-max_score – Max score for the predicted aesthetics in an image.
-any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
-condition.
-args – Extra positional arguments.
-kwargs – Extra keyword arguments.
+lang – Samples in which languages to keep.
+min_score – The min language identification confidence
+scores of samples to keep.
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1315,8 +1258,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1331,26 +1274,117 @@
-
-class data_juicer.ops.filter. AudioSizeFilter ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. MaximumLineLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose audio size (in bytes/kb/MB/…) within a
-specific range.
+Filter to keep samples with maximum line length within a specific
+range.
-
-__init__ ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_size – The min audio size to keep samples. set to be “0” by
-default for no size constraint
-max_size – The max audio size to keep samples. set to be
-“1Tb” by default, an approximate for un-limited case
+min_len – The min filter length in this op, samples will
+be filtered if their maximum line length is below this
+parameter.
+max_len – The max filter length in this op, samples will
+be filtered if their maximum line length exceeds this
+parameter.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+compute_stats_batched ( samples , context = False ) [source]
+
+
+
+
+process_batched ( samples ) [source]
+
+
+
+
+
+
+class data_juicer.ops.filter. PerplexityFilter ( lang : str = 'en' , max_ppl : float = 1500 , * args , ** kwargs ) [source]
+Bases: Filter
+Filter to keep samples with perplexity score less than a specific max
+value.
+
+
+__init__ ( lang : str = 'en' , max_ppl : float = 1500 , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+lang – Compute perplexity for samples in which language.
+max_ppl – The max filter perplexity in this op, samples
+will be filtered if their perplexity exceeds this parameter.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+compute_stats_batched ( samples , context = False ) [source]
+
+
+
+
+process_batched ( samples ) [source]
+
+
+
+
+
+
+class data_juicer.ops.filter. PhraseGroundingRecallFilter ( hf_owlvit : str = 'google/owlvit-base-patch32' , trust_remote_code : bool = False , min_recall : float = 0.1 , max_recall : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , iou_thr : float = 0.5 , large_area_ratio_thr : float = 0.95 , conf_thr : float = 0.0 , * args , ** kwargs ) [source]
+Bases: Filter
+Filter to keep samples whose locating recalls of phrases extracted
+from text in the images are within a specified range.
+
+
+__init__ ( hf_owlvit : str = 'google/owlvit-base-patch32' , trust_remote_code : bool = False , min_recall : float = 0.1 , max_recall : float = 1.0 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , iou_thr : float = 0.5 , large_area_ratio_thr : float = 0.95 , conf_thr : float = 0.0 , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+hf_owlvit – Owl-ViT model name on huggingface to locate the
+phrases extracted from the text.
+min_recall – The min phrase grounding recall to keep samples.
+max_recall – The max phrase grounding recall to keep samples.
+horizontal_flip – Flip image horizontally (left to right).
+vertical_flip – Flip image vertically (top to bottom).
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all audios. ‘any’: keep this sample if any audios meet the
-condition. ‘all’: keep this sample only if all audios meet the
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
condition.
+reduce_mode – reduce mode when one text corresponds to
+multiple images in a chunk.
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
+iou_thr – the IoU threshold for NMS-like post-process. If two
+predicted bboxes overlap with an IoU larger than this
+threshold, the bbox with lower confidence will be removed. Default:
+0.5.
+large_area_ratio_thr – the area ratio threshold for filtering out
+those large predicted bboxes. If the area of a predicted bbox
+accounts for more than this ratio threshold of the whole image
+area, this bbox will be removed. Default: 0.95.
+conf_thr – the confidence score threshold for removing
+low-confidence bboxes. If the confidence score of a predicted bbox
+is lower than the threshold, this bbox will be removed. Default: 0.
args – extra args
kwargs – extra args
@@ -1359,8 +1393,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1378,8 +1412,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1394,30 +1428,24 @@
-
-class data_juicer.ops.filter. StopWordsFilter ( lang : str = 'en' , tokenization : bool = False , min_ratio : float = 0.3 , stopwords_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. SpecialCharactersFilter ( min_ratio : float = 0.0 , max_ratio : float = 0.25 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with stopword ratio larger than a specific min
-value.
+Filter to keep samples with special-char ratio within a specific
+range.
-
-__init__ ( lang : str = 'en' , tokenization : bool = False , min_ratio : float = 0.3 , stopwords_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
+
+__init__ ( min_ratio : float = 0.0 , max_ratio : float = 0.25 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – Consider stopwords in what language. If lang ==
-“all”, we will adopt the one merged from all the available
-languages
-tokenization – whether to use model to tokenize documents
-min_ratio – The min filter ratio in this op.
-stopwords_dir – The directory storing the stopwords
-file(s) whose name includes “stopwords” and in json format
-use_words_aug – Whether to augment words, especially for
-Chinese and Vietnamese
-words_aug_group_sizes – The group size of words to augment
-words_aug_join_char – The join char between words to
-augment
+min_ratio – The min filter ratio in this op, samples will
+be filtered if their special-char ratio is below this
+parameter.
+max_ratio – The max filter ratio in this op, samples will
+be filtered if their special-char ratio exceeds this
+parameter.
args – extra args
kwargs – extra args
@@ -1426,8 +1454,47 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_batched ( samples ) [source]
+
+
+
+
+process_batched ( samples ) [source]
+
+
+
+
+
+
+class data_juicer.ops.filter. SpecifiedFieldFilter ( field_key : str = '' , target_value : List = [] , * args , ** kwargs ) [source]
+Bases: Filter
+Filter based on specified field information.
+If the specified field information in the sample is not within the
+specified target value, the sample will be filtered.
+
+
+__init__ ( field_key : str = '' , target_value : List = [] , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+field_key – Filter based on the specified value
+corresponding to the target key. The target key
+corresponding to multi-level field information needs to be
+separated by ‘.’.
+target_value – The range of specified field information
+corresponding to the samples that need to be retained.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+compute_stats_single ( sample ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1445,8 +1512,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1461,64 +1528,29 @@
-
-class data_juicer.ops.filter. CharacterRepetitionFilter ( rep_len : int [ int ] = 10 , min_ratio : float = 0.0 , max_ratio : float = 0.5 , * args , ** kwargs ) [source]
-Bases: Filter
-Filter to keep samples with char-level n-gram repetition ratio within a
-specific range.
-
-
-__init__ ( rep_len : int [ int ] = 10 , min_ratio : float = 0.0 , max_ratio : float = 0.5 , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-rep_len – Repetition length for char-level n-gram.
-min_ratio – The min filter ratio in this op, samples will
-be filtered if their char-level n-gram repetition ratio is
-below this parameter.
-max_ratio – The max filter ratio in this op, samples will
-be filtered if their char-level n-gram repetition ratio
-exceeds this parameter.
-args – extra args
-kwargs – extra args
-
-
-
-
-
-
-
-compute_stats_batched ( samples ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
-
-
-
-
-
-class data_juicer.ops.filter. ImageShapeFilter ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. SpecifiedNumericFieldFilter ( field_key : str = '' , min_value : float = -9223372036854775807 , max_value : float = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with image shape (w, h) within specific ranges.
+Filter based on specified numeric field information.
+If the specified numeric information in the sample is not within the
+specified range, the sample will be filtered.
-
-__init__ ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( field_key : str = '' , min_value : float = -9223372036854775807 , max_value : float = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_width – The min width to keep samples.
-max_width – The max width to keep samples.
-min_height – The min height to keep samples.
-max_height – The max height to keep samples.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
-condition.
+field_key – Filter based on the specified numeric value
+corresponding to the target key. The target key
+corresponding to multi-level field information needs to be
+separated by ‘.’.
+min_value – The min filter value in SpecifiedNumericField
+op, samples will be filtered if their specified numeric
+field value is below this parameter.
+max_value – The max filter value in SpecifiedNumericField
+op, samples will be filtered if their specified numeric
+field value exceeds this parameter.
args – extra args
kwargs – extra args
@@ -1527,8 +1559,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1546,8 +1578,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1562,25 +1594,30 @@
-
-class data_juicer.ops.filter. VideoDurationFilter ( min_duration : float = 0 , max_duration : float = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. StopWordsFilter ( lang : str = 'en' , tokenization : bool = False , min_ratio : float = 0.3 , stopwords_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose videos’ durations are within a specified range.
+Filter to keep samples with stopword ratio larger than a specific min
+value.
-
-__init__ ( min_duration : float = 0 , max_duration : float = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( lang : str = 'en' , tokenization : bool = False , min_ratio : float = 0.3 , stopwords_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_duration – The min video duration to keep samples in seconds.
-It’s 0 by default.
-max_duration – The max video duration to keep samples in seconds.
-It’s sys.maxsize by default.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
-condition.
+lang – Consider stopwords in what language. If lang ==
+“all”, we will adopt the one merged from all the available
+languages
+tokenization – whether to use model to tokenize documents
+min_ratio – The min filter ratio in this op.
+stopwords_dir – The directory storing the stopwords
+file(s) whose name includes “stopwords” and in json format
+use_words_aug – Whether to augment words, especially for
+Chinese and Vietnamese
+words_aug_group_sizes – The group size of words to augment
+words_aug_join_char – The join char between words to
+augment
args – extra args
kwargs – extra args
@@ -1589,8 +1626,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1608,8 +1645,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1624,30 +1661,29 @@
-
-class data_juicer.ops.filter. TextActionFilter ( lang : str = 'en' , min_action_num : int = 1 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. SuffixFilter ( suffixes : str | List [ str ] = [] , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep texts those contain actions in the text.
+Filter to keep samples with specified suffix.
-
-__init__ ( lang : str = 'en' , min_action_num : int = 1 , * args , ** kwargs ) [source]
+
+__init__ ( suffixes : str | List [ str ] = [] , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – language of the text in the samples. ‘en’ for detection of
-actions in English and ‘zh’ for detection of actions in Chinese.
-mini_action_num – The min action number in the filtering. samples
-will be filtered if their action number in the text is below this
-parameter.
+suffixes – the suffix of text that will be kept.
+For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1665,8 +1701,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1681,49 +1717,30 @@
-
-class data_juicer.ops.filter. VideoOcrAreaRatioFilter ( min_area_ratio : float = 0 , max_area_ratio : float = 1.0 , frame_sample_num : int [ int ] = 3 , languages_to_detect : str | List [ str ] = ['ch_sim', 'en'] , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. TextActionFilter ( lang : str = 'en' , min_action_num : int = 1 , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose detected text area ratios for specified frames
-in the video are within a specified range.
+Filter to keep texts that contain actions.
-
-__init__ ( min_area_ratio : float = 0 , max_area_ratio : float = 1.0 , frame_sample_num : int [ int ] = 3 , languages_to_detect : str | List [ str ] = ['ch_sim', 'en'] , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( lang : str = 'en' , min_action_num : int = 1 , * args , ** kwargs ) [source]
Initialization method.
-
-Parameters:
-
-min_area_ratio – The min ocr area ratio to keep samples. It’s 0
-by default.
-max_area_ratio – The max ocr area ratio to keep samples. It’s 1.0
-by default.
-frame_sample_num – The number of sampled frames to calculate the
-ocr area ratio. If it’s 1, only middle frame will be selected. If
-it’s 2, only the first and the last frames will be selected. If
-it’s larger than 2, in addition to the first and the last frames,
-other frames will be sampled evenly within the video duration.
-languages_to_detect – texts in which languages should be
-detected. Default: [‘ch_sim’, ‘en’]. Full language list can be
-found here: https://www.jaided.ai/easyocr/ .
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
-condition.
-args – extra args
-kwargs – extra args
+
+Parameters:
+
+lang – language of the text in the samples. ‘en’ for detection of
+actions in English and ‘zh’ for detection of actions in Chinese.
+min_action_num – The min action number in the filtering. Samples
+will be filtered if their action number in the text is below this
+parameter.
-
-get_reader ( rank ) [source]
-
-
-
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1741,8 +1758,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1757,52 +1774,34 @@
-
-class data_juicer.ops.filter. VideoNSFWFilter ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. TextEntityDependencyFilter ( lang : str = 'en' , min_dependency_num : int = 1 , any_or_all : str = 'all' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples whose videos have low nsfw scores.
+Identify the entities in the text which are independent of other tokens,
+and filter them. The text containing no entities will be omitted.
-
-__init__ ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( lang : str = 'en' , min_dependency_num : int = 1 , any_or_all : str = 'all' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_nsfw_model – nsfw detection model name on huggingface.
-score_threshold – the nsfw score threshold for samples.
-range from 0 to 1. Samples with nsfw score less than this threshold
-will be kept.
-frame_sampling_method – sampling method of extracting frame
-images from the videos.
-Should be one of [“all_keyframes”, “uniform”].
-The former one extracts all key frames (the number of which depends
-on the duration of the video) and the latter one extract specified
-number of frames uniformly from the video.
-Default: “all_keyframes”.
-frame_num – the number of frames to be extracted uniformly from
-the video. Only works when frame_sampling_method is “uniform”. If
-it’s 1, only the middle frame will be extracted. If it’s 2, only
-the first and the last frames will be extracted. If it’s larger
-than 2, in addition to the first and the last frames, other frames
-will be extracted uniformly within the video duration.
-reduce_mode – reduce mode for multiple sampled video frames.
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
-condition.
-args – extra args
-kwargs – extra args
+lang – language of the text in the samples. ‘en’ for detection of
+entities in English and ‘zh’ for detection of entities in Chinese.
+min_dependency_num – The min token number in the filtering.
+An object is independent if its number of edges in the dependency
+tree is below this parameter.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy.
+‘any’: keep this sample if any object is dependent. ‘all’: keep this
+sample only if all objects are dependent.
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1820,8 +1819,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1836,23 +1835,23 @@
-
-class data_juicer.ops.filter. SpecialCharactersFilter ( min_ratio : float = 0.0 , max_ratio : float = 0.25 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. TextLengthFilter ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with special-char ratio within a specific
+
Filter to keep samples with total text length within a specific
range.
-
-__init__ ( min_ratio : float = 0.0 , max_ratio : float = 0.25 , * args , ** kwargs ) [source]
+
+__init__ ( min_len : int = 10 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_ratio – The min filter ratio in this op, samples will
-be filtered if their special-char ratio is below this
+
min_len – The min text length in the filtering. samples
+will be filtered if their text length is below this
parameter.
-max_ratio – The max filter ratio in this op, samples will
-be filtered if their special-char ratio exceeds this
+
max_len – The max text length in the filtering. samples
+will be filtered if their text length exceeds this
parameter.
args – extra args
kwargs – extra args
@@ -1862,60 +1861,37 @@
-
-compute_stats_batched ( samples ) [source]
+
+compute_stats_batched ( samples ) [source]
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.filter. VideoFramesTextSimilarityFilter ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : float = 0.1 , max_score : float = 1.0 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. TokenNumFilter ( hf_tokenizer : str = 'EleutherAI/pythia-6.9b-deduped' , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples those similarities between sampled video frame
-images and text within a specific range.
+Filter to keep samples with total token number within a specific
+range.
-
-__init__ ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : float = 0.1 , max_score : float = 1.0 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
+
+__init__ ( hf_tokenizer : str = 'EleutherAI/pythia-6.9b-deduped' , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_clip – clip model name on huggingface to compute
-the similarity between frame image and text. It’s kind of
-language-related. For example, for Chinese datasets, ChineseCLIP
-might be a better choice.
-min_score – the min similarity to keep samples.
-max_score – the max similarity to keep samples.
-frame_sampling_method – sampling method of extracting frame
-images from the videos.
-Should be one of [“all_keyframes”, “uniform”].
-The former one extracts all key frames (the number of which depends
-on the duration of the video) and the latter one extract specified
-number of frames uniformly from the video.
-Default: “all_keyframes”.
-frame_num – the number of frames to be extracted uniformly from
-the video. Only works when frame_sampling_method is “uniform”. If
-it’s 1, only the middle frame will be extracted. If it’s 2, only
-the first and the last frames will be extracted. If it’s larger
-than 2, in addition to the first and the last frames, other frames
-will be extracted uniformly within the video duration.
-horizontal_flip – flip frame image horizontally (left to right).
-vertical_flip – flip frame image vertically (top to bottom).
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
-condition.
-reduce_mode – reduce mode when one text corresponds to
-multiple video frame images in a chunk.
-‘avg’: Take the average of multiple values
-‘max’: Take the max of multiple values
-‘min’: Take the min of multiple values
+hf_tokenizer – the tokenizer name of Hugging Face tokenizers.
+min_num – The min filter token number in this op, samples
+will be filtered if their token number is below this
+parameter.
+max_num – The max filter token number in this op, samples
+will be filtered if their token number exceeds this
+parameter.
args – extra args
kwargs – extra args
@@ -1924,8 +1900,8 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -1943,8 +1919,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -1959,34 +1935,57 @@
-
-class data_juicer.ops.filter. ImageAspectRatioFilter ( min_ratio : float = 0.333 , max_ratio : float = 3.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoAestheticsFilter ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.4 , max_score : float = 1.0 , frame_sampling_method : str = 'uniform' , frame_num : int [ int ] = 3 , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with image aspect ratio within a specific range.
-AspectRatio = W / H.
+Filter to keep data samples with aesthetics scores for specified frames
+in the videos within a specific range.
-
-__init__ ( min_ratio : float = 0.333 , max_ratio : float = 3.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( hf_scorer_model : str = '' , trust_remote_code : bool = False , min_score : float = 0.4 , max_score : float = 1.0 , frame_sampling_method : str = 'uniform' , frame_num : int [ int ] = 3 , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_ratio – The min aspect ratio to keep samples.
-max_ratio – The max aspect ratio to keep samples.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+
hf_scorer_model – Huggingface model name for the aesthetics
+predictor. By default, we will use
+‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’,
+refer to pypi.org/project/simple-aesthetics-predictor
+min_score – Min score for the predicted aesthetics in a video.
+max_score – Max score for the predicted aesthetics in a video.
+frame_sampling_method – sampling method of extracting frame
+images from the videos.
+Should be one of [“all_keyframes”, “uniform”].
+The former one extracts all key frames and the latter one extracts a
+specified number of frames uniformly from the video.
+Default: “uniform” with frame_num=3, considering that the number of
+keyframes can be large while their difference is usually small
+in terms of their aesthetics.
+frame_num – the number of frames to be extracted uniformly from
+the video. Only works when frame_sampling_method is “uniform”. If
+it’s 1, only the middle frame will be extracted. If it’s 2, only
+the first and the last frames will be extracted. If it’s larger
+than 2, in addition to the first and the last frames, other frames
+will be extracted uniformly within the video duration.
+any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
condition.
-args – extra args
-kwargs – extra args
+reduce_mode – reduce mode when one sample corresponds to
+multiple frames, must be one of [‘avg’, ‘max’, ‘min’].
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
+args – Extra positional arguments.
+kwargs – Extra keyword arguments.
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2004,8 +2003,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2020,24 +2019,25 @@
-
-class data_juicer.ops.filter. AudioDurationFilter ( min_duration : int = 0 , max_duration : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoAspectRatioFilter ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose audios’ durations are within a specified range.
+Filter to keep samples with video aspect ratio within a specific range.
+AspectRatio = W / H.
-
-__init__ ( min_duration : int = 0 , max_duration : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_duration – The min audio duration to keep samples in seconds.
-It’s 0 by default.
-max_duration – The max audio duration to keep samples in seconds.
-It’s sys.maxsize by default.
+min_ratio – The minimum aspect ratio to keep samples,
+supported format is a string, such as “9:21” or “9/21”.
+max_ratio – The maximum aspect ratio to keep samples,
+supported format is a string, such as “21:9” or “21/9”.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all audios. ‘any’: keep this sample if any audios meet the
-condition. ‘all’: keep this sample only if all audios meet the
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
condition.
args – extra args
kwargs – extra args
@@ -2047,8 +2047,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2066,8 +2066,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2082,21 +2082,25 @@
-
-class data_juicer.ops.filter. LanguageIDScoreFilter ( lang : str | List [ str ] = '' , min_score : float = 0.8 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoDurationFilter ( min_duration : float = 0 , max_duration : float = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples in a specific language with confidence score
-larger than a specific min value.
+Keep data samples whose videos’ durations are within a specified range.
-
-__init__ ( lang : str | List [ str ] = '' , min_score : float = 0.8 , * args , ** kwargs ) [source]
+
+__init__ ( min_duration : float = 0 , max_duration : float = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – Samples in which languages to keep.
-min_score – The min language identification confidence
-scores of samples to keep.
+min_duration – The min video duration to keep samples in seconds.
+It’s 0 by default.
+max_duration – The max video duration to keep samples in seconds.
+It’s sys.maxsize by default.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
+condition.
args – extra args
kwargs – extra args
@@ -2105,8 +2109,8 @@
-
-compute_stats_single ( sample ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2124,8 +2128,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2140,19 +2144,48 @@
-
-class data_juicer.ops.filter. SuffixFilter ( suffixes : str | List [ str ] = [] , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoFramesTextSimilarityFilter ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : float = 0.1 , max_score : float = 1.0 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with specified suffix.
+Filter to keep samples whose similarities between sampled video frame
+images and text are within a specific range.
-
-__init__ ( suffixes : str | List [ str ] = [] , * args , ** kwargs ) [source]
+
+__init__ ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : float = 0.1 , max_score : float = 1.0 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , any_or_all : str = 'any' , reduce_mode : str = 'avg' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-suffixes – the suffix of text that will be keep.
-For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]
+hf_clip – clip model name on huggingface to compute
+the similarity between frame image and text. It’s kind of
+language-related. For example, for Chinese datasets, ChineseCLIP
+might be a better choice.
+min_score – the min similarity to keep samples.
+max_score – the max similarity to keep samples.
+frame_sampling_method – sampling method of extracting frame
+images from the videos.
+Should be one of [“all_keyframes”, “uniform”].
+The former one extracts all key frames (the number of which depends
+on the duration of the video) and the latter one extracts a specified
+number of frames uniformly from the video.
+Default: “all_keyframes”.
+frame_num – the number of frames to be extracted uniformly from
+the video. Only works when frame_sampling_method is “uniform”. If
+it’s 1, only the middle frame will be extracted. If it’s 2, only
+the first and the last frames will be extracted. If it’s larger
+than 2, in addition to the first and the last frames, other frames
+will be extracted uniformly within the video duration.
+horizontal_flip – flip frame image horizontally (left to right).
+vertical_flip – flip frame image vertically (top to bottom).
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
+condition.
+reduce_mode – reduce mode when one text corresponds to
+multiple video frame images in a chunk.
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
args – extra args
kwargs – extra args
@@ -2161,8 +2194,8 @@
-
-compute_stats_single ( sample ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2180,8 +2213,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2196,25 +2229,37 @@
-
-class data_juicer.ops.filter. ImageSizeFilter ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoMotionScoreFilter ( min_score : float = 0.25 , max_score : float = 1.7976931348623157e+308 , sampling_fps : float [ float ] = 2 , size : int [ int ] | Tuple [ int [ int ] ] | Tuple [ int [ int ] , int [ int ] ] | None = None , max_size : int [ int ] | None = None , relative : bool = False , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Keep data samples whose image size (in Bytes/KB/MB/…) within a
-specific range.
+Filter to keep samples with video motion scores within a specific range. The
+Farneback’s algorithm from OpenCV is used to compute dense optical flow.
-
-__init__ ( min_size : str = '0' , max_size : str = '1TB' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_score : float = 0.25 , max_score : float = 1.7976931348623157e+308 , sampling_fps : float [ float ] = 2 , size : int [ int ] | Tuple [ int [ int ] ] | Tuple [ int [ int ] , int [ int ] ] | None = None , max_size : int [ int ] | None = None , relative : bool = False , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_size – The min image size to keep samples. set to be “0” by
-default for no size constraint
-max_size – The max image size to keep samples. set to be
-“1TB” by default, an approximate for un-limited case
+min_score – The minimum motion score to keep samples.
+max_score – The maximum motion score to keep samples.
+sampling_fps – The sampling rate in frames_per_second for
+optical flow calculations.
+size – Resize frames before computing optical flow. If size is a
+sequence like (h, w), frame size will be matched to this. If size
+is an int, smaller edge of frames will be matched to this number.
+i.e., if height > width, then the frame will be rescaled to (size *
+height / width, size). Default None to keep the original size.
+max_size – The maximum allowed for the longer edge of resized
+frames. If the longer edge of frames is greater than max_size after
+being resized according to size, size will be overruled so that the
+longer edge is equal to max_size. As a result, the smaller edge may
+be shorter than size. This is only supported if size is an int.
+relative – If True , the optical flow magnitude is normalized to
+a [0, 1] range, relative to the frame’s diagonal length.
any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
condition.
args – extra args
kwargs – extra args
@@ -2224,8 +2269,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2243,8 +2288,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2259,23 +2304,21 @@
-
-class data_juicer.ops.filter. VideoWatermarkFilter ( hf_watermark_model : str = 'amrul-hzz/watermark_detector' , trust_remote_code : bool = False , prob_threshold : float = 0.8 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoNSFWFilter ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples whose videos have no watermark with high
-probability.
+Filter to keep samples whose videos have low nsfw scores.
-
-__init__ ( hf_watermark_model : str = 'amrul-hzz/watermark_detector' , trust_remote_code : bool = False , prob_threshold : float = 0.8 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( hf_nsfw_model : str = 'Falconsai/nsfw_image_detection' , trust_remote_code : bool = False , score_threshold : float = 0.5 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_watermark_model – watermark detection model name on
-huggingface.
-prob_threshold – the predicted watermark probability threshold
-for samples. range from 0 to 1. Samples with watermark probability
-less than this threshold will be kept.
+hf_nsfw_model – nsfw detection model name on huggingface.
+score_threshold – the nsfw score threshold for samples.
+range from 0 to 1. Samples with nsfw score less than this threshold
+will be kept.
frame_sampling_method – sampling method of extracting frame
images from the videos.
Should be one of [“all_keyframes”, “uniform”].
@@ -2305,8 +2348,8 @@
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2324,8 +2367,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2340,26 +2383,34 @@
-
-class data_juicer.ops.filter. WordsNumFilter ( lang : str = 'en' , tokenization : bool = False , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoOcrAreaRatioFilter ( min_area_ratio : float = 0 , max_area_ratio : float = 1.0 , frame_sample_num : int [ int ] = 3 , languages_to_detect : str | List [ str ] = ['ch_sim', 'en'] , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with total words number within a specific
-range.
+Keep data samples whose detected text area ratios for specified frames
+in the video are within a specified range.
-
-__init__ ( lang : str = 'en' , tokenization : bool = False , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( min_area_ratio : float = 0 , max_area_ratio : float = 1.0 , frame_sample_num : int [ int ] = 3 , languages_to_detect : str | List [ str ] = ['ch_sim', 'en'] , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – sample in which language.
-tokenization – whether to use model to tokenize documents
-min_num – The min filter word number in this op, samples
-will be filtered if their word number is below this
-parameter.
-max_num – The max filter word number in this op, samples
-will be filtered if their word number exceeds this
-parameter.
+min_area_ratio – The min ocr area ratio to keep samples. It’s 0
+by default.
+max_area_ratio – The max ocr area ratio to keep samples. It’s 1.0
+by default.
+frame_sample_num – The number of sampled frames to calculate the
+ocr area ratio. If it’s 1, only the middle frame will be selected. If
+it’s 2, only the first and the last frames will be selected. If
+it’s larger than 2, in addition to the first and the last frames,
+other frames will be sampled evenly within the video duration.
+languages_to_detect – texts in which languages should be
+detected. Default: [‘ch_sim’, ‘en’]. Full language list can be
+found here: https://www.jaided.ai/easyocr/ .
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
+condition.
args – extra args
kwargs – extra args
@@ -2368,47 +2419,75 @@
-
-compute_stats_batched ( samples , context = False ) [source]
+
+get_reader ( rank ) [source]
-
-process_batched ( samples ) [source]
-
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
+Compute stats for the sample which is used as a metric to decide
+whether to filter this sample.
+
+Parameters:
+
+
+Returns:
+sample with computed stats
+
+
+
+
+
+
+process_single ( sample ) [source]
+For sample level, sample –> Boolean.
+
+Parameters:
+sample – sample to decide whether to filter
+
+Returns:
+true for keeping and false for filtering
+
+
+
-
-class data_juicer.ops.filter. ImageFaceCountFilter ( cv_classifier : str = '' , min_face_count : int = 1 , max_face_count : int = 1 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoResolutionFilter ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with the number of faces within a specific range.
+Keep data samples whose videos’ resolutions are within a specified range.
-
-__init__ ( cv_classifier : str = '' , min_face_count : int = 1 , max_face_count : int = 1 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( min_width : int = 1 , max_width : int = 9223372036854775807 , min_height : int = 1 , max_height : int = 9223372036854775807 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-cv_classifier – OpenCV classifier path for face detection.
-By default, we will use ‘haarcascade_frontalface_alt.xml’.
-min_face_count – Minimum number of faces required for samples.
-max_face_count – Maximum number of faces required for samples.
-any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+
min_width – The min horizontal resolution.
+max_width – The max horizontal resolution.
+min_height – The min vertical resolution.
+max_height – The max vertical resolution.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
condition.
-args – Extra positional arguments.
-kwargs – Extra keyword arguments.
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2426,8 +2505,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2442,35 +2521,51 @@
-
-class data_juicer.ops.filter. ImageFaceRatioFilter ( cv_classifier : str = '' , min_ratio : float = 0.0 , max_ratio : float = 0.4 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoTaggingFromFramesFilter ( tags : List [ str ] = ['people'] , contain : str = 'any' , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with face area ratios within a specific range.
+Filter to keep samples whose videos contain the given tags.
-
-__init__ ( cv_classifier : str = '' , min_ratio : float = 0.0 , max_ratio : float = 0.4 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( tags : List [ str ] = ['people'] , contain : str = 'any' , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-cv_classifier – OpenCV classifier path for face detection.
-By default, we will use ‘haarcascade_frontalface_alt.xml’.
-min_ratio – Min ratio for the largest face area in an image.
-max_ratio – Max ratio for the largest face area in an image.
-any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
+
tags – a tag list used to filter the videos; the full tag list can be found
+in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
+contain – require the videos containing ‘any’ or ‘all’ tags.
+When tags equal to [], ‘all’ keeps all samples, ‘any’ keeps no
+sample.
+frame_sampling_method – sampling method of extracting frame
+images from the videos. Should be one of
+[“all_keyframes”, “uniform”].
+The former one extracts all key frames (the number of which depends
+on the duration of the video) and the latter one extracts a specified
+number of frames uniformly from the video.
+Default: “all_keyframes”.
+frame_num – the number of frames to be extracted uniformly from
+the video. Only works when frame_sampling_method is “uniform”. If
+it’s 1, only the middle frame will be extracted. If it’s 2, only
+the first and the last frames will be extracted. If it’s larger
+than 2, in addition to the first and the last frames, other frames
+will be extracted uniformly within the video duration.
+tag_field_name – the field name to store the tags. It’s
+“__dj__video_frame_tags__” by default.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
condition.
-args – Extra positional arguments.
-kwargs – Extra keyword arguments.
+args – extra args
+kwargs – extra args
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2488,8 +2583,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2504,31 +2599,44 @@
-
-class data_juicer.ops.filter. FlaggedWordFilter ( lang : str = 'en' , tokenization : bool = False , max_ratio : float = 0.045 , flagged_words_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. VideoWatermarkFilter ( hf_watermark_model : str = 'amrul-hzz/watermark_detector' , trust_remote_code : bool = False , prob_threshold : float = 0.8 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with flagged-word ratio less than a specific max
-value.
+Filter to keep samples whose videos have no watermark with high
+probability.
-
-__init__ ( lang : str = 'en' , tokenization : bool = False , max_ratio : float = 0.045 , flagged_words_dir : str = '/home/runner/.cache/data_juicer/assets' , use_words_aug : bool = False , words_aug_group_sizes : List [ int [ int ] ] = [2] , words_aug_join_char : str = '' , * args , ** kwargs ) [source]
+
+__init__ ( hf_watermark_model : str = 'amrul-hzz/watermark_detector' , trust_remote_code : bool = False , prob_threshold : float = 0.8 , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , reduce_mode : str = 'avg' , any_or_all : str = 'any' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – Consider flagged words in what language. If lang ==
-“all”, we will adopt the one merged from all the available
-languages
-tokenization – Whether to use model to tokenize documents
-max_ratio – The max filter ratio in this op.
-flagged_words_dir – The directory storing the
-flagged_words file(s) whose name includes “flagged_words”
-and in json format
-use_words_aug – Whether to augment words, especially for
-Chinese and Vietnamese
-words_aug_group_sizes – The group size of words to augment
-words_aug_join_char – The join char between words to
-augment
+hf_watermark_model – watermark detection model name on
+huggingface.
+prob_threshold – the predicted watermark probability threshold
+for samples. range from 0 to 1. Samples with watermark probability
+less than this threshold will be kept.
+frame_sampling_method – sampling method of extracting frame
+images from the videos.
+Should be one of [“all_keyframes”, “uniform”].
+The former one extracts all key frames (the number of which depends
+on the duration of the video) and the latter one extracts a specified
+number of frames uniformly from the video.
+Default: “all_keyframes”.
+frame_num – the number of frames to be extracted uniformly from
+the video. Only works when frame_sampling_method is “uniform”. If
+it’s 1, only the middle frame will be extracted. If it’s 2, only
+the first and the last frames will be extracted. If it’s larger
+than 2, in addition to the first and the last frames, other frames
+will be extracted uniformly within the video duration.
+reduce_mode – reduce mode for multiple sampled video frames.
+‘avg’: Take the average of multiple values
+‘max’: Take the max of multiple values
+‘min’: Take the min of multiple values
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all videos. ‘any’: keep this sample if any videos meet the
+condition. ‘all’: keep this sample only if all videos meet the
+condition.
args – extra args
kwargs – extra args
@@ -2537,8 +2645,8 @@
-
-compute_stats_single ( sample , context = False ) [source]
+
+compute_stats_single ( sample , rank = None , context = False ) [source]
Compute stats for the sample which is used as a metric to decide
whether to filter this sample.
@@ -2556,8 +2664,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None ) [source]
For sample level, sample –> Boolean.
Parameters:
@@ -2613,38 +2721,26 @@
-
-class data_juicer.ops.filter. VideoMotionScoreFilter ( min_score : float = 0.25 , max_score : float = 1.7976931348623157e+308 , sampling_fps : float [ float ] = 2 , size : int [ int ] | Tuple [ int [ int ] ] | Tuple [ int [ int ] , int [ int ] ] | None = None , max_size : int [ int ] | None = None , relative : bool = False , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.filter. WordsNumFilter ( lang : str = 'en' , tokenization : bool = False , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Filter
-Filter to keep samples with video motion scores within a specific range. The
-Farneback’s algorith from OpenCV is used to compute dense optical flow.
+Filter to keep samples with total words number within a specific
+range.
-
-__init__ ( min_score : float = 0.25 , max_score : float = 1.7976931348623157e+308 , sampling_fps : float [ float ] = 2 , size : int [ int ] | Tuple [ int [ int ] ] | Tuple [ int [ int ] , int [ int ] ] | None = None , max_size : int [ int ] | None = None , relative : bool = False , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( lang : str = 'en' , tokenization : bool = False , min_num : int = 10 , max_num : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_score – The minimum motion score to keep samples.
-max_score – The maximum motion score to keep samples.
-sampling_fps – The sampling rate in frames_per_second for
-optical flow calculations.
-size – Resize frames before computing optical flow. If size is a
-sequence like (h, w), frame size will be matched to this. If size
-is an int, smaller edge of frames will be matched to this number.
-i.e, if height > width, then frame will be rescaled to (size *
-height / width, size). Default None to keep the original size.
-max_size – The maximum allowed for the longer edge of resized
-frames. If the longer edge of frames is greater than max_size after
-being resized according to size, size will be overruled so that the
-longer edge is equal to max_size. As a result, the smaller edge may
-be shorter than size. This is only supported if size is an int.
-relative – If True , the optical flow magnitude is normalized to
-a [0, 1] range, relative to the frame’s diagonal length.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all videos. ‘any’: keep this sample if any videos meet the
-condition. ‘all’: keep this sample only if all videos meet the
-condition.
+lang – sample in which language.
+tokenization – whether to use model to tokenize documents
+min_num – The min filter word number in this op, samples
+will be filtered if their word number is below this
+parameter.
+max_num – The max filter word number in this op, samples
+will be filtered if their word number exceeds this
+parameter.
args – extra args
kwargs – extra args
@@ -2653,110 +2749,14 @@
-
-compute_stats_single ( sample , context = False ) [source]
-Compute stats for the sample which is used as a metric to decide
-whether to filter this sample.
-
-Parameters:
-
-
-Returns:
-sample with computed stats
-
-
-
-
-
-
-process_single ( sample ) [source]
-For sample level, sample –> Boolean.
-
-Parameters:
-sample – sample to decide whether to filter
-
-Returns:
-true for keeping and false for filtering
-
-
-
-
-
-
-
-
-class data_juicer.ops.filter. ImagePairSimilarityFilter ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : ClosedUnitInterval = 0.1 , max_score : ClosedUnitInterval = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
-Bases: Filter
-Filter to keep image pairs with similarities between images
-within a specific range.
-
-
-__init__ ( hf_clip = 'openai/clip-vit-base-patch32' , trust_remote_code = False , min_score : ClosedUnitInterval = 0.1 , max_score : ClosedUnitInterval = 1.0 , any_or_all : str = 'any' , * args , ** kwargs ) [source]
-Initialization method.
-
-
-param hf_clip:
-clip model name on huggingface to compute
-the similarity between image and text.
-
-param min_score:
-The min similarity to keep samples.
-
-param max_score:
-The max similarity to keep samples.
-
-param any_or_all:
-keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
-condition.
-
-param args:
-extra args
-
-param kwargs:
-extra args
-
-
-
-
-
-
-
-compute_stats_single ( sample , rank = None , context = False ) [source]
-Compute stats for the sample which is used as a metric to decide
-whether to filter this sample.
-
-Parameters:
-
-
-Returns:
-sample with computed stats
-
-
-
+
+compute_stats_batched ( samples , context = False ) [source]
+
-
-process_single ( sample , rank = None ) [source]
-For sample level, sample –> Boolean.
-
-Parameters:
-sample – sample to decide whether to filter
-
-Returns:
-true for keeping and false for filtering
-
-
-
+
+process_batched ( samples ) [source]
+
diff --git a/data_juicer.ops.mapper.html b/data_juicer.ops.mapper.html
index baf51b62e..16326ffcb 100644
--- a/data_juicer.ops.mapper.html
+++ b/data_juicer.ops.mapper.html
@@ -46,53 +46,55 @@
data_juicer.ops
data_juicer.ops.filter
data_juicer.ops.mapper
data_juicer.ops.deduplicator
@@ -130,53 +132,22 @@
data_juicer.ops.mapper
-
-class data_juicer.ops.mapper. VideoCaptioningFromAudioMapper ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to caption a video according to its audio streams based on
-Qwen-Audio model.
-
-
-__init__ ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-
-
-
-
-
-
-process_batched ( samples , rank = None ) [source]
-
-
-
-
-
-
-class data_juicer.ops.mapper. VideoTaggingFromAudioMapper ( hf_ast : str = 'MIT/ast-finetuned-audioset-10-10-0.4593' , trust_remote_code : bool = False , tag_field_name : str = '__dj__video_audio_tags__' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. AudioFFmpegWrappedMapper ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to generate video tags from audio streams extracted by video
-using the Audio Spectrogram Transformer.
+Simple wrapper for FFmpeg audio filters.
-
-__init__ ( hf_ast : str = 'MIT/ast-finetuned-audioset-10-10-0.4593' , trust_remote_code : bool = False , tag_field_name : str = '__dj__video_audio_tags__' , * args , ** kwargs ) [source]
+
+__init__ ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_ast – path to the HF model to tag from audios.
-trust_remote_code – whether to trust the remote code of HF models
-tag_field_name – the field name to store the tags. It’s
-“__dj__video_audio_tags__” in default.
+filter_name – ffmpeg audio filter name.
+filter_kwargs – keyword-arguments passed to ffmpeg filter.
+global_args – list-arguments passed to ffmpeg command-line.
+capture_stderr – whether to capture stderr.
+overwrite_output – whether to overwrite output file.
args – extra args
kwargs – extra args
@@ -185,8 +156,8 @@
-
-process_single ( sample , rank = None ) [source]
+
+process_single ( sample ) [source]
For sample level, sample –> sample
Parameters:
@@ -201,44 +172,42 @@
-
-class data_juicer.ops.mapper. ImageCaptioningFromGPT4VMapper ( mode : str = 'description' , api_key : str = '' , max_token : int = 500 , temperature : float [ float ] = 1.0 , system_prompt : str = '' , user_prompt : str = '' , user_prompt_key : str | None = None , keep_original_sample : bool = True , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. ChineseConvertMapper ( mode : str = 's2t' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to generate samples whose texts are generated based on
-gpt-4-visison and the image.
+Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
+and Japanese Kanji.
-
-__init__ ( mode : str = 'description' , api_key : str = '' , max_token : int = 500 , temperature : float [ float ] = 1.0 , system_prompt : str = '' , user_prompt : str = '' , user_prompt_key : str | None = None , keep_original_sample : bool = True , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+
+__init__ ( mode : str = 's2t' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-mode – mode of text generated from images, can be one of
-[‘resoning’, ‘description’, ‘conversation’, ‘custom’]
-api_key – the API key to authenticate the request.
-max_token – the maximum number of tokens to generate.
-Default is 500.
-temperature – controls the randomness of the output (range
-from 0 to 1). Default is 0.
-system_prompt – a string prompt used to set the context of a
-conversation and provide global guidance or rules for the
-gpt4-vision so that it can generate responses in the expected way.
-If mode set to custom , the parameter will be used.
-user_prompt – a string prompt to guide the generation of
-gpt4-vision for each samples. It’s “” in default, which means no
-prompt provided.
-uers_prompt_key – the key name of fields in samples to store
-prompts for each sample. It’s used for set different prompts for
-different samples. If it’s none, use prompt in parameter “prompt”.
-It’s None in default.
-keep_original_sample – whether to keep the original sample. If
-it’s set to False, there will be only generated text in the
-final datasets and the original text will be removed. It’s True
-in default.
-any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
-all images. ‘any’: keep this sample if any images meet the
-condition. ‘all’: keep this sample only if all images meet the
-condition.
+mode –
Choose the mode to convert Chinese:
+s2t: Simplified Chinese to Traditional Chinese,
+t2s: Traditional Chinese to Simplified Chinese,
+s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
+tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
+s2hk: Simplified Chinese to Traditional Chinese
+(Hong Kong variant),
+hk2s: Traditional Chinese (Hong Kong variant) to Simplified
+Chinese,
+s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
+with Taiwanese idiom,
+tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
+with Mainland Chinese idiom,
+t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
+tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
+hk2t: Traditional Chinese (Hong Kong variant) to Traditional
+Chinese,
+t2hk: Traditional Chinese to Traditional Chinese
+(Hong Kong variant),
+t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese
+Kanji,
+jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese
+Characters,
+
args – extra args
kwargs – extra args
@@ -247,21 +216,21 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. PunctuationNormalizationMapper ( * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. CleanCopyrightMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to normalize unicode punctuations to English punctuations in text
+
Mapper to clean copyright comments at the beginning of the text
samples.
-
-__init__ ( * args , ** kwargs ) [source]
+
+__init__ ( * args , ** kwargs ) [source]
Initialization method.
Parameters:
@@ -274,25 +243,26 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. RemoveBibliographyMapper ( * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. CleanEmailMapper ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove bibliography at the end of documents in Latex
-samples.
+Mapper to clean email in text samples.
-
-__init__ ( * args , ** kwargs ) [source]
+
+__init__ ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
+pattern – regular expression pattern to search for within text.
+repl – replacement string, default is empty string.
args – extra args
kwargs – extra args
@@ -301,25 +271,24 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. SentenceSplitMapper ( lang : str = 'en' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. CleanHtmlMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to split text samples to sentences.
+Mapper to clean html code in text samples.
-
-__init__ ( lang : str = 'en' , * args , ** kwargs ) [source]
+
+__init__ ( * args , ** kwargs ) [source]
Initialization method.
Parameters:
@@ -328,71 +297,26 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. VideoSplitBySceneMapper ( detector : str = 'ContentDetector' , threshold : float [ float ] = 27.0 , min_scene_len : int [ int ] = 15 , show_progress : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. CleanIpMapper ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to cut videos into scene clips.
-
-
-avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
-
-
+Mapper to clean ipv4 and ipv6 address in text samples.
-
-__init__ ( detector : str = 'ContentDetector' , threshold : float [ float ] = 27.0 , min_scene_len : int [ int ] = 15 , show_progress : bool = False , * args , ** kwargs ) [source]
+
+__init__ ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-detector – Algorithm from scenedetect.detectors . Should be one
-of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector`].
-threshold – Threshold passed to the detector.
-min_scene_len – Minimum length of any scene.
-show_progress – Whether to show progress from scenedetect.
-args – extra args
-kwargs – extra args
-
-
-
-
-
-
-
-process_single ( sample , context = False ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
-
-
-
-
-
-class data_juicer.ops.mapper. CleanIpMapper ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to clean ipv4 and ipv6 address in text samples.
-
-
-__init__ ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-pattern – regular expression pattern to search for within text.
-repl – replacement string, default is empty string.
+pattern – regular expression pattern to search for within text.
+repl – replacement string, default is empty string.
args – extra args
kwargs – extra args
@@ -436,20 +360,18 @@
-
+
+class data_juicer.ops.mapper. ExpandMacroMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove headers at the beginning of documents in Latex
+
Mapper to expand macro definitions in the document body of Latex
samples.
-
+
+__init__ ( * args , ** kwargs ) [source]
Initialization method.
Parameters:
@@ -458,28 +380,27 @@
-
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. RemoveTableTextMapper ( min_col : int [ int ] = 2 , max_col : int [ int ] = 20 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. FixUnicodeMapper ( normalization : str | None = None , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove table texts from text samples.
-Regular expression is used to remove tables in the range of column
-number of tables.
+Mapper to fix unicode errors in text samples.
-
-__init__ ( min_col : int [ int ] = 2 , max_col : int [ int ] = 20 , * args , ** kwargs ) [source]
+
+__init__ ( normalization : str | None = None , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_col – The min number of columns of table to remove.
-max_col – The max number of columns of table to remove.
+normalization – the specified form of Unicode
+normalization mode, which can be one of
+[‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’], default ‘NFC’.
args – extra args
kwargs – extra args
@@ -488,56 +409,104 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. VideoRemoveWatermarkMapper ( roi_strings : List [ str ] = ['0,0,0.1,0.1'] , roi_type : str = 'ratio' , roi_key : str | None = None , frame_num : int [ int ] = 10 , min_frame_threshold : int [ int ] = 7 , detection_method : str = 'pixel_value' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. GenerateQAFromExamplesMapper ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , seed_file : str = '' , example_num : int [ int ] = 3 , similarity_threshold : float = 0.7 , system_prompt : str | None = None , input_template : str | None = None , example_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Bases: Mapper
-Remove the watermarks in videos given regions.
+Mapper to generate question and answer pairs from examples.
+You should configure an empty dataset in your yaml config file:
+`` `
+generated_dataset_config:
+
+type: ‘EmptyFormatter’ # use RayEmptyFormatter when enable ray
+length: ${The number of generated samples}
+feature_keys: ${text key}
+
+`` `
+The number of samples generated is determined by
+the length of the empty dataset.
+
+
+DEFAULT_SYSTEM_PROMPT = '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:\n1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n'
+
+
+
+
+DEFAULT_INPUT_TEMPLATE = '{}'
+
+
+
+
+DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}'
+
+
+
+
+DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'
+
+
+
+
+DEFAULT_OUTPUT_PATTERN = '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
+
+
-
-__init__ ( roi_strings : List [ str ] = ['0,0,0.1,0.1'] , roi_type : str = 'ratio' , roi_key : str | None = None , frame_num : int [ int ] = 10 , min_frame_threshold : int [ int ] = 7 , detection_method : str = 'pixel_value' , * args , ** kwargs ) [source]
+
+__init__ ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , seed_file : str = '' , example_num : int [ int ] = 3 , similarity_threshold : float = 0.7 , system_prompt : str | None = None , input_template : str | None = None , example_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Initialization method.
Parameters:
-roi_strings – a given list of regions the watermarks locate.
-The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”,
-or “[x1, y1, x2, y2]”.
-roi_type – the roi string type. When the type is ‘pixel’, (x1,
-y1), (x2, y2) are the locations of pixels in the top left corner
-and the bottom right corner respectively. If the roi_type is
-‘ratio’, the coordinates are normalized by wights and heights.
-roi_key – the key name of fields in samples to store roi_strings
-for each sample. It’s used for set different rois for different
-samples. If it’s none, use rois in parameter “roi_strings”.
-It’s None in default.
-frame_num – the number of frames to be extracted uniformly from
-the video to detect the pixels of watermark.
-min_frame_threshold – a coodination is considered as the
-location of a watermark pixel when it is that in no less
-min_frame_threshold frames.
-detection_method – the method to detect the pixels of watermark.
-If it is ‘pixel_value’, we consider the distribution of pixel
-value in each frame. If it is ‘pixel_diversity’, we will consider
-the pixel diversity in different frames. The min_frame_threshold
-is useless and frame_num must be greater than 1 in
-‘pixel_diversity’ mode.
-args – extra args
-kwargs – extra args
+hf_model – Hugging Face model ID.
+seed_file – Path to the seed file in chatml format.
+example_num – The number of selected examples.
+Randomly select N examples from “seed_file” and
+put them into prompt as QA examples.
+similarity_threshold – The similarity score threshold
+between the generated samples and the seed examples.
+Range from 0 to 1. Samples with similarity score less than
+this threshold will be kept.
+system_prompt – System prompt for guiding the generation task.
+input_template – Template for building the input prompt. It must
+include one placeholder ‘{}’, which will be replaced by
+example_num formatted examples defined by example_template .
+example_template – Template for formatting one QA example. It
+must include one placeholder ‘{}’, which will be replaced by one
+formatted qa_pair.
+qa_pair_template – Template for formatting a single QA pair
+within each example. Must include two placeholders ‘{}’ for the
+question and answer.
+output_pattern – Regular expression pattern to extract questions
+and answers from model response.
+enable_vllm – Whether to use vllm for inference acceleration.
+model_params – Parameters for initializing the model.
+sampling_params – Sampling parameters for text generation.
+e.g {‘temperature’: 0.9, ‘top_p’: 0.95}
+kwargs – Extra keyword arguments.
-
-process_single ( sample , context = False ) [source]
+
+build_input ( qa_examples ) [source]
+
+
+
+
+parse_output ( raw_output ) [source]
+
+
+
+
+process_single ( sample = None , rank = None ) [source]
For sample level, sample –> sample
Parameters:
@@ -552,102 +521,308 @@
-
-class data_juicer.ops.mapper. RemoveRepeatSentencesMapper ( lowercase : bool = False , ignore_special_character : bool = True , min_repeat_sentence_length : int = 2 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. GenerateQAFromTextMapper ( hf_model : str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa' , * , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove repeat sentences in text samples.
+Mapper to generate question and answer pairs from text.
+Recommended model list: [
+
+‘alibaba-pai/pai-llama3-8b-doc2qa’,
+‘alibaba-pai/pai-baichuan2-7b-doc2qa’,
+‘alibaba-pai/pai-qwen1_5-4b-doc2qa’,
+‘alibaba-pai/pai-qwen1_5-7b-doc2qa’,
+‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’,
+‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’
+
+]
+These recommended models are all trained with Chinese data
+and are suitable for Chinese.
-
-__init__ ( lowercase : bool = False , ignore_special_character : bool = True , min_repeat_sentence_length : int = 2 , * args , ** kwargs ) [source]
+
+__init__ ( hf_model : str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa' , * , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Initialization method.
Parameters:
-lowercase – Whether to convert sample text to lower case
-ignore_special_character – Whether to ignore special
-characters when judging repeated sentences. Special characters
-are all characters except Chinese characters, letters and
-numbers.
-min_repeat_sentence_length – Sentences shorter than this
-length will not be deduplicated. If ignore_special_character is
-set to True, then special characters are not included in this
-length.
-args – extra args
-kwargs – extra args
+hf_model – Hugging Face model ID.
+output_pattern – Regular expression pattern to extract
+questions and answers from model response.
+enable_vllm – Whether to use vllm for inference acceleration.
+model_params – Parameters for initializing the model.
+sampling_params – Sampling parameters for text generation,
+e.g {‘temperature’: 0.9, ‘top_p’: 0.95}
+kwargs – Extra keyword arguments.
+The default data format parsed by this interface is as follows:
+Model Input:
+
+蒙古国的首都是乌兰巴托(Ulaanbaatar)
+冰岛的首都是雷克雅未克(Reykjavik)
+
+
+Model Output: 蒙古国的首都是乌兰巴托(Ulaanbaatar)
+冰岛的首都是雷克雅未克(Reykjavik)
+Human: 请问蒙古国的首都是哪里?
+Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。
+Human: 冰岛的首都是哪里呢?
+Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。
+…
+
+
-
-process_batched ( samples ) [source]
+
+parse_output ( raw_output ) [source]
+
+
+
+
+process_batched ( samples , rank = None ) [source]
-
-class data_juicer.ops.mapper. ImageDiffusionMapper ( hf_diffusion : str = 'CompVis/stable-diffusion-v1-4' , trust_remote_code : bool = False , torch_dtype : str = 'fp32' , revision : str = 'main' , strength : float [ float ] = 0.8 , guidance_scale : float = 7.5 , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , caption_key : str | None = None , hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. ImageBlurMapper ( p : float = 0.2 , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
Bases: Mapper
-Generate image by diffusion model
+Mapper to blur images.
-
-__init__ ( hf_diffusion : str = 'CompVis/stable-diffusion-v1-4' , trust_remote_code : bool = False , torch_dtype : str = 'fp32' , revision : str = 'main' , strength : float [ float ] = 0.8 , guidance_scale : float = 7.5 , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , caption_key : str | None = None , hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , * args , ** kwargs ) [source]
+
+__init__ ( p : float = 0.2 , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_diffusion – diffusion model name on huggingface to generate
-the image.
-torch_dtype – the floating point type used to load the diffusion
-model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]
-revision – The specific model version to use. It can be a
-branch name, a tag name, a commit id, or any identifier allowed
-by Git.
-strength – Indicates extent to transform the reference image.
-Must be between 0 and 1. image is used as a starting point and
-more noise is added the higher the strength. The number of
-denoising steps depends on the amount of noise initially added.
-When strength is 1, added noise is maximum and the denoising
-process runs for the full number of iterations specified in
-num_inference_steps. A value of 1 essentially ignores image.
-guidance_scale – A higher guidance scale value encourages the
-model to generate images closely linked to the text prompt at the
-expense of lower image quality. Guidance scale is enabled when
-guidance_scale > 1.
-aug_num – The image number to be produced by stable-diffusion
-model.
-keep_candidate_mode –
retain strategy for the generated
-$caption_num$ candidates.
-’random_any’: Retain the random one from generated captions
-
-’similar_one_simhash’: Retain the generated one that is most similar to the original caption
-
-
-’all’: Retain all generated captions by concatenation
-
+p – Probability of the image being blurred.
+blur_type – Type of blur kernel, including
+[‘mean’, ‘box’, ‘gaussian’].
+radius – Radius of blur kernel.
+args – extra args
+kwargs – extra args
-
-
Note
-
This is a batched_OP, whose input and output type are
-both list. Suppose there are $N$ list of input samples, whose batch
-size is $b$, and denote caption_num as $M$.
-The number of total samples after generation is $2Nb$ when
-keep_original_sample is True and $Nb$ when keep_original_sample is
-False. For ‘random_any’ and ‘similar_one_simhash’ mode,
-it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True
-and $MNb$ when keep_original_sample is False.
-
+
+
+
+
+process_single ( sample , context = False ) [source]
+For sample level, sample –> sample
Parameters:
-
-caption_key – the key name of fields in samples to store captions
-for each images. It can be a string if there is only one image in
-each sample. Otherwise, it should be a list. If it’s none,
-ImageDiffusionMapper will produce captions for each images.
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
+
+
+
+
+
+class data_juicer.ops.mapper. ImageCaptioningFromGPT4VMapper ( mode : str = 'description' , api_key : str = '' , max_token : int = 500 , temperature : float [ float ] = 1.0 , system_prompt : str = '' , user_prompt : str = '' , user_prompt_key : str | None = None , keep_original_sample : bool = True , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+Bases: Mapper
+Mapper to generate samples whose texts are generated based on
+gpt-4-vision and the image.
+
+
+__init__ ( mode : str = 'description' , api_key : str = '' , max_token : int = 500 , temperature : float [ float ] = 1.0 , system_prompt : str = '' , user_prompt : str = '' , user_prompt_key : str | None = None , keep_original_sample : bool = True , any_or_all : str = 'any' , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+mode – mode of text generated from images, can be one of
+[‘resoning’, ‘description’, ‘conversation’, ‘custom’]
+api_key – the API key to authenticate the request.
+max_token – the maximum number of tokens to generate.
+Default is 500.
+temperature – controls the randomness of the output (range
+from 0 to 1). Default is 1.0.
+system_prompt – a string prompt used to set the context of a
+conversation and provide global guidance or rules for the
+gpt4-vision so that it can generate responses in the expected way.
+If mode set to custom , the parameter will be used.
+user_prompt – a string prompt to guide the generation of
+gpt4-vision for each samples. It’s “” in default, which means no
+prompt provided.
+user_prompt_key – the key name of fields in samples to store
+prompts for each sample. It’s used to set different prompts for
+different samples. If it’s None, the prompt in parameter “user_prompt”
+is used. It’s None by default.
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only generated text in the
+final datasets and the original text will be removed. It’s True
+in default.
+any_or_all – keep this sample with ‘any’ or ‘all’ strategy of
+all images. ‘any’: keep this sample if any images meet the
+condition. ‘all’: keep this sample only if all images meet the
+condition.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+process_batched ( samples ) [source]
+
+
+
+
+
+
+class data_juicer.ops.mapper. ImageCaptioningMapper ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , * args , ** kwargs ) [source]
+Bases: Mapper
+Mapper to generate samples whose captions are generated based on
+another model and the figure.
+
+
+__init__ ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+hf_img2seq – model name on huggingface to generate caption
+caption_num – how many candidate captions to generate
+for each image
+keep_candidate_mode –
retain strategy for the generated
+$caption_num$ candidates.
+’random_any’: Retain the random one from generated captions
+
+’similar_one_simhash’: Retain the generated one that is most similar to the original caption
+
+
+’all’: Retain all generated captions by concatenation
+
+
+
+
+
+
Note
+
 This is a batched_OP, whose input and output type are
+both list. Suppose there are $N$ lists of input samples, whose batch
+size is $b$, and denote caption_num as $M$.
+For ‘random_any’ and ‘similar_one_simhash’ mode, the number of total
+samples after generation is $2Nb$ when keep_original_sample is True
+and $Nb$ when keep_original_sample is False.
+For ‘all’ mode, it’s $(1+M)Nb$ when keep_original_sample is True
+and $MNb$ when keep_original_sample is False.
+
+
+Parameters:
+
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only generated captions in the
+final datasets and the original captions will be removed. It’s True
+in default.
+prompt – a string prompt to guide the generation of blip2 model
+for all samples globally. It’s None in default, which means no
+prompt provided.
+prompt_key – the key name of fields in samples to store prompts
+for each sample. It’s used to set different prompts for different
+samples. If it’s None, the prompt in parameter “prompt” is used.
+It’s None by default.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+process_batched ( samples , rank = None ) [source]
+
+
Note
+
 This is a batched_OP, whose input and output type are
+both list. Suppose there are $N$ lists of input samples with batch
+size $b$, and denote caption_num as $M$.
+The number of total samples after generation is $2Nb$
+for ‘random_any’ and ‘similar_one_simhash’ mode,
+and $(1+M)Nb$ for ‘all’ mode.
+
+
+Parameters:
+samples –
+
+Returns:
+
+
+
+
+
+
+
+
+
+class data_juicer.ops.mapper. ImageDiffusionMapper ( hf_diffusion : str = 'CompVis/stable-diffusion-v1-4' , trust_remote_code : bool = False , torch_dtype : str = 'fp32' , revision : str = 'main' , strength : float [ float ] = 0.8 , guidance_scale : float = 7.5 , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , caption_key : str | None = None , hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , * args , ** kwargs ) [source]
+Bases: Mapper
+Generate image by diffusion model
+
+
+__init__ ( hf_diffusion : str = 'CompVis/stable-diffusion-v1-4' , trust_remote_code : bool = False , torch_dtype : str = 'fp32' , revision : str = 'main' , strength : float [ float ] = 0.8 , guidance_scale : float = 7.5 , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , caption_key : str | None = None , hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+hf_diffusion – diffusion model name on huggingface to generate
+the image.
+torch_dtype – the floating point type used to load the diffusion
+model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]
+revision – The specific model version to use. It can be a
+branch name, a tag name, a commit id, or any identifier allowed
+by Git.
+strength – Indicates extent to transform the reference image.
+Must be between 0 and 1. image is used as a starting point and
+more noise is added the higher the strength. The number of
+denoising steps depends on the amount of noise initially added.
+When strength is 1, added noise is maximum and the denoising
+process runs for the full number of iterations specified in
+num_inference_steps. A value of 1 essentially ignores image.
+guidance_scale – A higher guidance scale value encourages the
+model to generate images closely linked to the text prompt at the
+expense of lower image quality. Guidance scale is enabled when
+guidance_scale > 1.
+aug_num – The image number to be produced by stable-diffusion
+model.
+keep_candidate_mode –
retain strategy for the generated
+$caption_num$ candidates.
+’random_any’: Retain the random one from generated captions
+
+’similar_one_simhash’: Retain the generated one that is most similar to the original caption
+
+
+’all’: Retain all generated captions by concatenation
+
+
+
+
+
+
Note
+
This is a batched_OP, whose input and output type are
+both list. Suppose there are $N$ list of input samples, whose batch
+size is $b$, and denote caption_num as $M$.
+The number of total samples after generation is $2Nb$ when
+keep_original_sample is True and $Nb$ when keep_original_sample is
+False. For ‘random_any’ and ‘similar_one_simhash’ mode,
+it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True
+and $MNb$ when keep_original_sample is False.
+
+
+Parameters:
+
+caption_key – the key name of fields in samples to store captions
+for each images. It can be a string if there is only one image in
+each sample. Otherwise, it should be a list. If it’s none,
+ImageDiffusionMapper will produce captions for each images.
hf_img2seq – model name on huggingface to generate caption if
caption_key is None.
@@ -718,22 +893,21 @@
-
-class data_juicer.ops.mapper. VideoFFmpegWrappedMapper ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. ImageTaggingMapper ( tag_field_name : str = '__dj__image_tags__' , * args , ** kwargs ) [source]
Bases: Mapper
-Simple wrapper for FFmpeg video filters.
+Mapper to generate image tags.
-
-__init__ ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
-Initialization method.
+
+__init__ ( tag_field_name : str = '__dj__image_tags__' , * args , ** kwargs ) [source]
+Initialization method.
+:param tag_field_name: the field name to store the tags. It’s
+
+“__dj__image_tags__” in default.
+
Parameters:
-filter_name – ffmpeg video filter name.
-filter_kwargs – keyword-arguments passed to ffmpeg filter.
-global_args – list-arguments passed to ffmpeg command-line.
-capture_stderr – whether to capture stderr.
-overwrite_output – whether to overwrite output file.
args – extra args
kwargs – extra args
@@ -742,8 +916,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , rank = None , context = False ) [source]
For sample level, sample –> sample
Parameters:
@@ -757,43 +931,61 @@
-
-
-class data_juicer.ops.mapper. ChineseConvertMapper ( mode : str = 's2t' , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to convert Chinese between Traditional Chinese, Simplified Chinese
-and Japanese Kanji.
-
-
-__init__ ( mode : str = 's2t' , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-mode –
Choose the mode to convert Chinese:
-s2t: Simplified Chinese to Traditional Chinese,
-t2s: Traditional Chinese to Simplified Chinese,
-s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),
-tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,
-s2hk: Simplified Chinese to Traditional Chinese
-(Hong Kong variant),
-hk2s: Traditional Chinese (Hong Kong variant) to Simplified
-Chinese,
-s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard)
-with Taiwanese idiom,
-tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese
-with Mainland Chinese idiom,
-t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),
-tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,
-hk2t: Traditional Chinese (Hong Kong variant) to Traditional
-Chinese,
-t2hk: Traditional Chinese to Traditional Chinese
-(Hong Kong variant),
-t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese
-Kanji,
-jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese
-Characters,
-
+
+
+class data_juicer.ops.mapper. NlpaugEnMapper ( sequential : bool = False , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , delete_random_word : bool = False , swap_random_word : bool = False , spelling_error_word : bool = False , split_random_word : bool = False , keyboard_error_char : bool = False , ocr_error_char : bool = False , delete_random_char : bool = False , swap_random_char : bool = False , insert_random_char : bool = False , * args , ** kwargs ) [source]
+Bases: Mapper
+Mapper to simply augment samples in English based on nlpaug library.
+
+
+__init__ ( sequential : bool = False , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , delete_random_word : bool = False , swap_random_word : bool = False , spelling_error_word : bool = False , split_random_word : bool = False , keyboard_error_char : bool = False , ocr_error_char : bool = False , delete_random_char : bool = False , swap_random_char : bool = False , insert_random_char : bool = False , * args , ** kwargs ) [source]
+Initialization method. All augmentation methods use default parameters
+by default. We recommend using only 1-3 augmentation methods at a
+time. Otherwise, the semantics of samples might be changed
+significantly.
+
+Parameters:
+
+sequential – whether to combine all augmentation methods into a
+sequence. If it’s True, a sample will be augmented by all opened
+augmentation methods sequentially. If it’s False, each opened
+augmentation method would generate its augmented samples
+independently.
+aug_num – number of augmented samples to be generated. If
+sequential is True, there will be total aug_num augmented samples
+generated. If it’s False, there will be (aug_num *
+#opened_aug_method) augmented samples generated.
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only generated texts in the final
+datasets and the original texts will be removed. It’s True in
+default.
+delete_random_word – whether to open the augmentation method of
+deleting random words from the original texts. e.g. “I love LLM”
+–> “I LLM”
+swap_random_word – whether to open the augmentation method of
+swapping random contiguous words in the original texts. e.g. “I
+love LLM” –> “Love I LLM”
+spelling_error_word – whether to open the augmentation method of
+simulating the spelling error for words in the original texts. e.g.
+“I love LLM” –> “Ai love LLM”
+split_random_word – whether to open the augmentation method of
+splitting words randomly with whitespaces in the original texts.
+e.g. “I love LLM” –> “I love LL M”
+keyboard_error_char – whether to open the augmentation method of
+simulating the keyboard error for characters in the original texts.
+e.g. “I love LLM” –> “I ;ov4 LLM”
+ocr_error_char – whether to open the augmentation method of
+simulating the OCR error for characters in the original texts.
+e.g. “I love LLM” –> “I 10ve LLM”
+delete_random_char – whether to open the augmentation method of
+deleting random characters from the original texts. e.g. “I love
+LLM” –> “I oe LLM”
+swap_random_char – whether to open the augmentation method of
+swapping random contiguous characters in the original texts.
+e.g. “I love LLM” –> “I ovle LLM”
+insert_random_char – whether to open the augmentation method of
+inserting random characters into the original texts. e.g. “I love
+LLM” –> “I ^lKove LLM”
args – extra args
kwargs – extra args
@@ -802,8 +994,8 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
@@ -868,88 +1060,71 @@
-
-class data_juicer.ops.mapper. OptimizeInstructionMapper ( hf_model : str = 'alibaba-pai/Qwen2-7B-Instruct-Refine' , trust_remote_code : bool = False , system_prompt : str | None = None , enable_vllm : bool = True , tensor_parallel_size : int | None = None , max_model_len : int | None = None , max_num_seqs : int = 256 , sampling_params : Dict = {} , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. OptimizeQAMapper ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , system_prompt : str | None = None , input_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Bases: Mapper
-Mapper to optimize instruction.
-Recommended model list: [
-
-alibaba-pai/Qwen2-1.5B-Instruct-Refine
-alibaba-pai/Qwen2-7B-Instruct-Refine
-
-]
-
-
-__init__ ( hf_model : str = 'alibaba-pai/Qwen2-7B-Instruct-Refine' , trust_remote_code : bool = False , system_prompt : str | None = None , enable_vllm : bool = True , tensor_parallel_size : int | None = None , max_model_len : int | None = None , max_num_seqs : int = 256 , sampling_params : Dict = {} , * args , ** kwargs ) [source]
-Initialization method.
-:param hf_model: Hugginface model id.
-:param trust_remote_code: passed to transformers
-:param system_prompt: System prompt for optimize samples.
-:param enable_vllm: Whether to use vllm for inference acceleration.
-:param tensor_parallel_size: It is only valid when enable_vllm is True.
-
-The number of GPUs to use for distributed execution with tensor
-parallelism.
-
-
-Parameters:
-
-max_model_len – It is only valid when enable_vllm is True.
-Model context length. If unspecified, will be automatically
-derived from the model config.
-max_num_seqs – It is only valid when enable_vllm is True.
-Maximum number of sequences to be processed in a single iteration.
-sampling_params – Sampling parameters for text generation.
-e.g {‘temperature’: 0.9, ‘top_p’: 0.95}
-args – extra args
-kwargs – extra args
-
-
-
-
+Mapper to optimize question-answer pairs.
+
+
+DEFAULT_SYSTEM_PROMPT = '请优化输入的问答对,使【问题】和【回答】都更加详细、准确。必须按照以下标记格式,直接输出优化后的问答对:\n【问题】\n优化后的问题\n【回答】\n优化后的回答'
+
-
-
-process_single ( sample = None , rank = None ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
+
+
+DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
+
-
+
+
+DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
+
+
+
+
+DEFAULT_OUTPUT_PATTERN = '.*?【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
+
-
-
-class data_juicer.ops.mapper. ImageBlurMapper ( p : float = 0.2 , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to blur images.
-
-__init__ ( p : float = 0.2 , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
+
+__init__ ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , system_prompt : str | None = None , input_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
Initialization method.
Parameters:
-p – Probability of the image being blured.
-blur_type – Type of blur kernel, including
-[‘mean’, ‘box’, ‘gaussian’].
-radius – Radius of blur kernel.
-args – extra args
-kwargs – extra args
+hf_model – Hugging Face model ID.
+system_prompt – System prompt for guiding the optimization task.
+input_template – Template for building the input for the model.
+Please make sure the template contains one placeholder ‘{}’, which
+corresponds to the question and answer pair generated by
+the qa_pair_template parameter.
+qa_pair_template – Template for formatting the question and
+answer pair. Please make sure the template contains two
+‘{}’ to format question and answer.
+output_pattern – Regular expression pattern to extract question
+and answer from model response.
+enable_vllm – Whether to use vLLM for inference acceleration.
+model_params – Parameters for initializing the model.
+sampling_params – Sampling parameters for text generation (e.g.,
+{‘temperature’: 0.9, ‘top_p’: 0.95}).
+kwargs – Extra keyword arguments.
-
-process_single ( sample , context = False ) [source]
+
+build_input ( sample ) [source]
+
+
+
+
+parse_output ( raw_output ) [source]
+
+
+
+
+process_single ( sample = None , rank = None ) [source]
For sample level, sample –> sample
Parameters:
@@ -964,47 +1139,52 @@
-
-class data_juicer.ops.mapper. CleanCopyrightMapper ( * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to clean copyright comments at the beginning of the text
-samples.
+
+class data_juicer.ops.mapper. OptimizeQueryMapper ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , system_prompt : str | None = None , input_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
+Bases: OptimizeQAMapper
+Mapper to optimize query in question-answer pairs.
+
+
+DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'
+
+
-
-__init__ ( * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-args – extra args
-kwargs – extra args
-
-
-
+
+parse_output ( raw_output ) [source]
+
+
+
+
+class data_juicer.ops.mapper. OptimizeResponseMapper ( hf_model : str = 'Qwen/Qwen2.5-7B-Instruct' , * , system_prompt : str | None = None , input_template : str | None = None , qa_pair_template : str | None = None , output_pattern : str | None = None , enable_vllm : bool = False , model_params : Dict | None = None , sampling_params : Dict | None = None , ** kwargs ) [source]
+Bases: OptimizeQAMapper
+Mapper to optimize response in question-answer pairs.
+
+
+DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
+
+
-
-process_batched ( samples ) [source]
+
+parse_output ( raw_output ) [source]
-
-class data_juicer.ops.mapper. RemoveNonChineseCharacterlMapper ( keep_alphabet : bool = True , keep_number : bool = True , keep_punc : bool = True , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. PunctuationNormalizationMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove non chinese Character in text samples.
+Mapper to normalize unicode punctuations to English punctuations in text
+samples.
-
-__init__ ( keep_alphabet : bool = True , keep_number : bool = True , keep_punc : bool = True , * args , ** kwargs ) [source]
+
+__init__ ( * args , ** kwargs ) [source]
Initialization method.
Parameters:
-keep_alphabet – whether to keep alphabet
-keep_number – whether to keep number
-keep_punc – whether to keep punctuation
args – extra args
kwargs – extra args
@@ -1013,28 +1193,25 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. VideoSplitByKeyFrameMapper ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. RemoveBibliographyMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to split video by key frame.
+Mapper to remove bibliography at the end of documents in Latex
+samples.
-
-__init__ ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
+
+__init__ ( * args , ** kwargs ) [source]
Initialization method.
Parameters:
@@ -1043,31 +1220,28 @@
-
-get_split_key_frame ( video_key , container ) [source]
-
-
-
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. RemoveSpecificCharsMapper ( chars_to_remove : str | List [ str ] = '◆●■►▼▲▴∆▻▷❖♡□' , * args , ** kwargs ) [source]
+
Bases: Mapper
-Mapper to clean specific chars in text samples.
+Mapper to remove comments in different kinds of documents.
+Only support ‘tex’ for now.
-
-__init__ ( chars_to_remove : str | List [ str ] = '◆●■►▼▲▴∆▻▷❖♡□' , * args , ** kwargs ) [source]
+
Initialization method.
Parameters:
-chars_to_remove – a list or a string including all
-characters that need to be removed from text.
+doc_type – Type of document to remove comments.
+inline – Whether to remove inline comments.
+multiline – Whether to remove multiline comments.
args – extra args
kwargs – extra args
@@ -1076,77 +1250,57 @@
-
-process_batched ( samples ) [source]
+
-
-class data_juicer.ops.mapper. VideoResizeAspectRatioMapper ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , strategy : str = 'increase' , * args , ** kwargs ) [source]
+
Bases: Mapper
-Mapper to resize videos by aspect ratio.
-AspectRatio = W / H.
-
-
-STRATEGY = ['decrease', 'increase']
-
-
+Mapper to remove headers at the beginning of documents in Latex
+samples.
-
-__init__ ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , strategy : str = 'increase' , * args , ** kwargs ) [source]
+
Initialization method.
-Parameters:
-
-min_ratio – The minimum aspect ratio to enforce videos with
-an aspect ratio below min_ratio will be resized to match
-this minimum ratio. The ratio should be provided as a string
-in the format “9:21” or “9/21”.
-max_ratio – The maximum aspect ratio to enforce videos with
-an aspect ratio above max_ratio will be resized to match
-this maximum ratio. The ratio should be provided as a string
-in the format “21:9” or “21/9”.
-strategy – The resizing strategy to apply when adjusting the
-video dimensions. It can be either ‘decrease’ to reduce the
-dimension or ‘increase’ to enlarge it. Accepted values are
-[‘decrease’, ‘increase’].
-args – extra args
-kwargs – extra args
-
-
-
-
-
-
-
-process_single ( sample ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
+Parameters:
+
+
+
+
+
-
-class data_juicer.ops.mapper. CleanHtmlMapper ( * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. RemoveLongWordsMapper ( min_len : int = 1 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to clean html code in text samples.
+Mapper to remove long words within a specific range.
-
-__init__ ( * args , ** kwargs ) [source]
+
+__init__ ( min_len : int = 1 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
+min_len – The minimum word length in this op. Words
+will be filtered if their length is below this parameter.
+max_len – The maximum word length in this op. Words
+will be filtered if their length exceeds this parameter.
args – extra args
kwargs – extra args
@@ -1155,27 +1309,32 @@
-
-process_batched ( samples ) [source]
+
+should_keep_long_word ( word ) [source]
+
+
+
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. WhitespaceNormalizationMapper ( * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. RemoveNonChineseCharacterlMapper ( keep_alphabet : bool = True , keep_number : bool = True , keep_punc : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to normalize different kinds of whitespaces to whitespace ‘ ‘ (0x20)
-in text samples.
-Different kinds of whitespaces can be found here:
-https://en.wikipedia.org/wiki/Whitespace_character
+Mapper to remove non-Chinese characters in text samples.
-
-__init__ ( * args , ** kwargs ) [source]
+
+__init__ ( keep_alphabet : bool = True , keep_number : bool = True , keep_punc : bool = True , * args , ** kwargs ) [source]
Initialization method.
Parameters:
+keep_alphabet – whether to keep alphabet
+keep_number – whether to keep number
+keep_punc – whether to keep punctuation
args – extra args
kwargs – extra args
@@ -1184,39 +1343,33 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. VideoTaggingFromFramesMapper ( frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. RemoveRepeatSentencesMapper ( lowercase : bool = False , ignore_special_character : bool = True , min_repeat_sentence_length : int = 2 , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to generate video tags from frames extract by video.
+Mapper to remove repeat sentences in text samples.
-
-__init__ ( frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , * args , ** kwargs ) [source]
+
+__init__ ( lowercase : bool = False , ignore_special_character : bool = True , min_repeat_sentence_length : int = 2 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-frame_sampling_method – sampling method of extracting frame
-images from the videos. Should be one of
-[“all_keyframes”, “uniform”].
-The former one extracts all key frames (the number of which depends
-on the duration of the video) and the latter one extract specified
-number of frames uniformly from the video.
-Default: “all_keyframes”.
-frame_num – the number of frames to be extracted uniformly from
-the video. Only works when frame_sampling_method is “uniform”. If
-it’s 1, only the middle frame will be extracted. If it’s 2, only
-the first and the last frames will be extracted. If it’s larger
-than 2, in addition to the first and the last frames, other frames
-will be extracted uniformly within the video duration.
-tag_field_name – the field name to store the tags. It’s
-“__dj__video_frame_tags__” in default.
+lowercase – Whether to convert sample text to lower case
+ignore_special_character – Whether to ignore special
+characters when judging repeated sentences. Special characters
+are all characters except Chinese characters, letters and
+numbers.
+min_repeat_sentence_length – Sentences shorter than this
+length will not be deduplicated. If ignore_special_character is
+set to True, then special characters are not included in this
+length.
args – extra args
kwargs – extra args
@@ -1225,37 +1378,26 @@
-
-process_single ( sample , rank = None , context = False ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
+
+process_batched ( samples ) [source]
+
-
+
+class data_juicer.ops.mapper. RemoveSpecificCharsMapper ( chars_to_remove : str | List [ str ] = '◆●■►▼▲▴∆▻▷❖♡□' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove comments in different kinds of documents.
-Only support ‘tex’ for now.
+Mapper to clean specific chars in text samples.
-
+
+__init__ ( chars_to_remove : str | List [ str ] = '◆●■►▼▲▴∆▻▷❖♡□' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-doc_type – Type of document to remove comments.
-inline – Whether to remove inline comments.
-multiline – Whether to remove multiline comments.
+chars_to_remove – a list or a string including all
+characters that need to be removed from text.
args – extra args
kwargs – extra args
@@ -1264,25 +1406,28 @@
-
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. ExpandMacroMapper ( * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. RemoveTableTextMapper ( min_col : int [ int ] = 2 , max_col : int [ int ] = 20 , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to expand macro definitions in the document body of Latex
-samples.
+Mapper to remove table texts from text samples.
+Regular expression is used to remove tables in the range of column
+number of tables.
-
-__init__ ( * args , ** kwargs ) [source]
+
+__init__ ( min_col : int [ int ] = 2 , max_col : int [ int ] = 20 , * args , ** kwargs ) [source]
Initialization method.
Parameters:
@@ -1291,145 +1436,62 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples ) [source]
-
+
+class data_juicer.ops.mapper. RemoveWordsWithIncorrectSubstringsMapper ( lang : str = 'en' , tokenization : bool = False , substrings : List [ str ] | None = None , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to extract question and answer pair from text samples.
-Recommended model list: [
-
-‘alibaba-pai/pai-llama3-8b-doc2qa’,
-‘alibaba-pai/pai-baichuan2-7b-doc2qa’,
-‘alibaba-pai/pai-qwen1_5-4b-doc2qa’,
-‘alibaba-pai/pai-qwen1_5-7b-doc2qa’,
-‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’,
-‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’
-
-]
-These recommended models are all trained with Chinese data
-and are suitable for Chinese.
+Mapper to remove words with incorrect substrings.
-
-Initialization method.
-:param hf_model: Hugginface model id.
-:param trust_remote_code: passed to transformers
-:param pattern: regular expression pattern to search for within text.
-:param qa_format: Output format of question and answer pair.
-:param enable_vllm: Whether to use vllm for inference acceleration.
-:param tensor_parallel_size: It is only valid when enable_vllm is True.
-
-The number of GPUs to use for distributed execution with tensor
-parallelism.
-
+
+__init__ ( lang : str = 'en' , tokenization : bool = False , substrings : List [ str ] | None = None , * args , ** kwargs ) [source]
+Initialization method.
Parameters:
-max_model_len – It is only valid when enable_vllm is True.
-Model context length. If unspecified, will be automatically
-derived from the model config.
-max_num_seqs – It is only valid when enable_vllm is True.
-Maximum number of sequences to be processed in a single iteration.
-sampling_params – Sampling parameters for text generation.
-e.g {‘temperature’: 0.9, ‘top_p’: 0.95}
+lang – the language of the samples
+tokenization – whether to use model to tokenize documents
+substrings – The incorrect substrings in words.
args – extra args
kwargs – extra args
-The default data format parsed by this interface is as follows:
-Model Input:
-
-蒙古国的首都是乌兰巴托(Ulaanbaatar)
-冰岛的首都是雷克雅未克(Reykjavik)
-
-
-Model Output: 蒙古国的首都是乌兰巴托(Ulaanbaatar)
-冰岛的首都是雷克雅未克(Reykjavik)
-Human: 请问蒙古国的首都是哪里?
-Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。
-Human: 冰岛的首都是哪里呢?
-Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。
-…
-
-
-
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
+
+should_keep_word_with_incorrect_substrings ( word , substrings ) [source]
+
+
+
+
+process_batched ( samples ) [source]
+
-
-class data_juicer.ops.mapper. ImageCaptioningMapper ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. ReplaceContentMapper ( pattern : str | List [ str ] | None = None , repl : str | List [ str ] = '' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to generate samples whose captions are generated based on
-another model and the figure.
+Mapper to replace all content in the text that matches
+a specific regular expression pattern with a designated
+replacement string.
-
-__init__ ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , * args , ** kwargs ) [source]
+
+__init__ ( pattern : str | List [ str ] | None = None , repl : str | List [ str ] = '' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_img2seq – model name on huggingface to generate caption
-caption_num – how many candidate captions to generate
-for each image
-keep_candidate_mode –
retain strategy for the generated
-$caption_num$ candidates.
-’random_any’: Retain the random one from generated captions
-
-’similar_one_simhash’: Retain the generated one that is most similar to the original caption
-
-
-’all’: Retain all generated captions by concatenation
-
-
-
-
-
-
Note
-
This is a batched_OP, whose input and output type are
-both list. Suppose there are $N$ list of input samples, whose batch
-size is $b$, and denote caption_num as $M$.
-The number of total samples after generation is $2Nb$ when
-keep_original_sample is True and $Nb$ when keep_original_sample is
-False. For ‘random_any’ and ‘similar_one_simhash’ mode,
-it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True
-and $MNb$ when keep_original_sample is False.
-
-
-Parameters:
-
-keep_original_sample – whether to keep the original sample. If
-it’s set to False, there will be only generated captions in the
-final datasets and the original captions will be removed. It’s True
-in default.
-prompt – a string prompt to guide the generation of blip2 model
-for all samples globally. It’s None in default, which means no
-prompt provided.
-prompt_key – the key name of fields in samples to store prompts
-for each sample. It’s used for set different prompts for different
-samples. If it’s none, use prompt in parameter “prompt”. It’s None
-in default.
+pattern – regular expression pattern(s) to search for within text
+repl – replacement string(s), default is empty string
args – extra args
kwargs – extra args
@@ -1438,44 +1500,56 @@
-
-process_batched ( samples , rank = None ) [source]
-
-
Note
-
This is a batched_OP, whose input and output type are
-both list. Suppose there are $N$ input sample list with batch
-size as $b$, and denote caption_num as $M$.
-the number of total samples after generation is $2Nb$
-for ‘random_any’ and ‘similar_one’ mode,
-and $(1+M)Nb$ for ‘all’ mode.
-
+
+process_batched ( samples ) [source]
+
+
+
+
+
+
+class data_juicer.ops.mapper. SentenceSplitMapper ( lang : str = 'en' , * args , ** kwargs ) [source]
+Bases: Mapper
+Mapper to split text samples to sentences.
+
+
+__init__ ( lang : str = 'en' , * args , ** kwargs ) [source]
+Initialization method.
Parameters:
-samples –
-
-Returns:
-
+
+
+
+process_batched ( samples ) [source]
+
+
-
-class data_juicer.ops.mapper. RemoveWordsWithIncorrectSubstringsMapper ( lang : str = 'en' , tokenization : bool = False , substrings : List [ str ] | None = None , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoCaptioningFromAudioMapper ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove words with incorrect substrings.
+Mapper to caption a video according to its audio streams based on
+Qwen-Audio model.
-
-__init__ ( lang : str = 'en' , tokenization : bool = False , substrings : List [ str ] | None = None , * args , ** kwargs ) [source]
+
+__init__ ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-lang – sample in which language
-tokenization – whether to use model to tokenize documents
-substrings – The incorrect substrings in words.
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only captioned samples in the
+final datasets and the original sample will be removed. It’s True
+in default.
args – extra args
kwargs – extra args
@@ -1484,32 +1558,27 @@
-
-should_keep_word_with_incorrect_substrings ( word , substrings ) [source]
-
-
-
-
-process_batched ( samples ) [source]
+
+process_batched ( samples , rank = None ) [source]
-
-class data_juicer.ops.mapper. VideoCaptioningFromVideoMapper ( hf_video_blip : str = 'kpyu/video-blip-opt-2.7b-ego4d' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoCaptioningFromFramesMapper ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
Bases: Mapper
Mapper to generate samples whose captions are generated based on
-a video-to-text model and sampled video frame.
+an image-to-text model and sampled video frames. Captions from different
+frames will be concatenated to a single string.
-
-__init__ ( hf_video_blip : str = 'kpyu/video-blip-opt-2.7b-ego4d' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
+
+__init__ ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_video_blip – video-blip model name on huggingface
-to generate caption
+hf_img2seq – model name on huggingface to generate caption
caption_num – how many candidate captions to generate
for each video
keep_candidate_mode –
retain strategy for the generated
@@ -1542,7 +1611,7 @@
it’s set to False, there will be only generated captions in the
final datasets and the original captions will be removed. It’s True
in default.
-prompt – a string prompt to guide the generation of video-blip
+
prompt – a string prompt to guide the generation of image-to-text
model for all samples globally. It’s None in default, which means
no prompt provided.
prompt_key – the key name of fields in samples to store prompts
@@ -1572,8 +1641,8 @@
-
-process_batched ( samples , rank = None , context = False ) [source]
+
+process_batched ( samples , rank = None , context = False ) [source]
Parameters:
samples –
@@ -1583,316 +1652,67 @@
-
Note
-
This is a batched_OP, whose the input and output type are
-both list. Suppose there are $N$ input sample list with batch
-size as $b$, and denote caption_num as $M$.
-the number of total samples after generation is $2Nb$
-for ‘random_any’ and ‘similar_one’ mode,
-and $(1+M)Nb$ for ‘all’ mode.
-
-
-
-
-
-
-
-class data_juicer.ops.mapper. VideoCaptioningFromSummarizerMapper ( hf_summarizer : str | None = None , trust_remote_code : bool = False , consider_video_caption_from_video : bool = True , consider_video_caption_from_audio : bool = True , consider_video_caption_from_frames : bool = True , consider_video_tags_from_audio : bool = True , consider_video_tags_from_frames : bool = True , vid_cap_from_vid_args : Dict | None = None , vid_cap_from_frm_args : Dict | None = None , vid_tag_from_aud_args : Dict | None = None , vid_tag_from_frm_args : Dict | None = None , keep_tag_num : int [ int ] = 5 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to generate video captions by summarizing several kinds of generated
-texts (captions from video/audio/frames, tags from audio/frames, …)
-
-
-__init__ ( hf_summarizer : str | None = None , trust_remote_code : bool = False , consider_video_caption_from_video : bool = True , consider_video_caption_from_audio : bool = True , consider_video_caption_from_frames : bool = True , consider_video_tags_from_audio : bool = True , consider_video_tags_from_frames : bool = True , vid_cap_from_vid_args : Dict | None = None , vid_cap_from_frm_args : Dict | None = None , vid_tag_from_aud_args : Dict | None = None , vid_tag_from_frm_args : Dict | None = None , keep_tag_num : int [ int ] = 5 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-hf_summarizer – the summarizer model used to summarize texts
-generated by other methods.
-consider_video_caption_from_video – whether to consider the video
-caption generated from video directly in the summarization process.
-Default: True.
-consider_video_caption_from_audio – whether to consider the video
-caption generated from audio streams in the video in the
-summarization process. Default: True.
-consider_video_caption_from_frames – whether to consider the
-video caption generated from sampled frames from the video in the
-summarization process. Default: True.
-consider_video_tags_from_audio – whether to consider the video
-tags generated from audio streams in the video in the summarization
-process. Default: True.
-consider_video_tags_from_frames – whether to consider the video
-tags generated from sampled frames from the video in the
-summarization process. Default: True.
-vid_cap_from_vid_args – the arg dict for video captioning from
-video directly with keys are the arg names and values are the arg
-values. Default: None.
-vid_cap_from_frm_args – the arg dict for video captioning from
-sampled frames from the video with keys are the arg names and
-values are the arg values. Default: None.
-vid_tag_from_aud_args – the arg dict for video tagging from audio
-streams in the video with keys are the arg names and values are the
-arg values. Default: None.
-vid_tag_from_frm_args – the arg dict for video tagging from
-sampled frames from the video with keys are the arg names and
-values are the arg values. Default: None.
-keep_tag_num – max number N of tags from sampled frames to keep.
-Too many tags might bring negative influence to summarized text, so
-we consider to only keep the N most frequent tags. Default: 5.
-keep_original_sample – whether to keep the original sample. If
-it’s set to False, there will be only summarized captions in the
-final datasets and the original captions will be removed. It’s True
-in default.
-args – extra args
-kwargs – extra args
-
-
-
-
-
-
-
-process_batched ( samples , rank = None ) [source]
-
-
-
-
-
-
-class data_juicer.ops.mapper. GenerateInstructionMapper ( hf_model : str = 'Qwen/Qwen-7B-Chat' , seed_file : str = '' , instruct_num : int [ int ] = 3 , trust_remote_code : bool = False , similarity_threshold : float = 0.7 , prompt_template : str | None = None , qa_pair_template : str | None = None , example_template : str | None = None , qa_extraction_pattern : str | None = None , enable_vllm : bool = True , tensor_parallel_size : int | None = None , max_model_len : int | None = None , max_num_seqs : int = 256 , sampling_params : Dict = {} , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to generate new instruction text data.
-You should configure an empty dataset in your yaml config file:
-`` `
-generated_dataset_config:
-
-type: ‘EmptyFormatter’ # use RayEmptyFormatter when enable ray
-length: ${The number of generated samples}
-feature_keys: ${text key}
-
-`` `
-The number of samples generated is determined by
-the length of the empty dataset.
-
-
-__init__ ( hf_model : str = 'Qwen/Qwen-7B-Chat' , seed_file : str = '' , instruct_num : int [ int ] = 3 , trust_remote_code : bool = False , similarity_threshold : float = 0.7 , prompt_template : str | None = None , qa_pair_template : str | None = None , example_template : str | None = None , qa_extraction_pattern : str | None = None , enable_vllm : bool = True , tensor_parallel_size : int | None = None , max_model_len : int | None = None , max_num_seqs : int = 256 , sampling_params : Dict = {} , * args , ** kwargs ) [source]
-
-Initialization method.
-
-param hf_model:
-Hugginface model id.
-
-param seed_file:
-Seed file path, chatml format.
-
-param instruct_num:
-The number of instruction samples.
-Randomly select N samples from “seed_file” and
-put them into prompt as instruction samples.
-
-param trust_remote_code:
-passed to transformers
-
-param similarity_threshold:
-The similarity score threshold
-between the generated samples and the seed samples.
-Range from 0 to 1. Samples with similarity score less than
-this threshold will be kept.
-
-param prompt_template:
-Prompt template for generate samples.
-Please make sure the template contains “{augmented_data}”,
-which corresponds to the augmented samples.
-
-param qa_pair_template:
-Prompt template for generate question
-and answer pair description. Please make sure the template
-contains two “{}” to format question and answer.
-Default: ‘【问题】
-
-
-
-{}
-【回答】
-{}
-‘.
-
-
-param example_template:
-Prompt template for generate examples.
-Please make sure the template contains “{qa_pairs}”, which
-corresponds to the question and answer pair description
-generated by param qa_pair_template .
-Default: ‘
-
-
-
-如下是一条示例数据:
-
-{qa_pairs}’
-param qa_extraction_pattern:
-Regular expression pattern for parsing
-question and answer from model response.
-
-param enable_vllm:
-Whether to use vllm for inference acceleration.
-
-param tensor_parallel_size:
-It is only valid when enable_vllm is True.
-The number of GPUs to use for distributed execution with tensor
-parallelism.
-
-param max_model_len:
-It is only valid when enable_vllm is True.
-Model context length. If unspecified, will be automatically
-derived from the model config.
-
-param max_num_seqs:
-It is only valid when enable_vllm is True.
-Maximum number of sequences to be processed in a single iteration.
-
-param sampling_params:
-Sampling parameters for text generation.
-e.g {‘temperature’: 0.9, ‘top_p’: 0.95}
-
-param args:
-extra args
-
-param kwargs:
-extra args
-
-
-
-
-
-
-
-
-load_seed_qa_samples ( seed_file ) [source]
-Load QA pairs from chatml format file.
-
-
-
-
-build_prompt ( qa_samples , prompt_template ) [source]
-
-
-
-
-parse_chatml_str ( input_str ) [source]
-
-
-
-
-parse_response ( response_str ) [source]
-
-
-
-
-max_rouge_l_score ( reference , candidates ) [source]
-
-
-
-
-process_single ( sample = None , rank = None ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
-
-
-
-
-
-class data_juicer.ops.mapper. FixUnicodeMapper ( normalization : str | None = None , * args , ** kwargs ) [source]
-Bases: Mapper
-Mapper to fix unicode errors in text samples.
-
-
-__init__ ( normalization : str | None = None , * args , ** kwargs ) [source]
-Initialization method.
-
-Parameters:
-
-normalization – the specified form of Unicode
-normalization mode, which can be one of
-[‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’], default ‘NFC’.
-args – extra args
-kwargs – extra args
-
-
-
+Note
+This is a batched_OP, whose input and output types are
+both lists. Suppose there are $N$ input sample lists with batch
+size $b$, and denote caption_num as $M$.
+The total number of samples after generation is $2Nb$
+for ‘random_any’ and ‘similar_one’ modes,
+and $(1+M)Nb$ for ‘all’ mode.
+
-
-
-process_batched ( samples ) [source]
-
-
-
-class data_juicer.ops.mapper. NlpaugEnMapper ( sequential : bool = False , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , delete_random_word : bool = False , swap_random_word : bool = False , spelling_error_word : bool = False , split_random_word : bool = False , keyboard_error_char : bool = False , ocr_error_char : bool = False , delete_random_char : bool = False , swap_random_char : bool = False , insert_random_char : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoCaptioningFromSummarizerMapper ( hf_summarizer : str | None = None , trust_remote_code : bool = False , consider_video_caption_from_video : bool = True , consider_video_caption_from_audio : bool = True , consider_video_caption_from_frames : bool = True , consider_video_tags_from_audio : bool = True , consider_video_tags_from_frames : bool = True , vid_cap_from_vid_args : Dict | None = None , vid_cap_from_frm_args : Dict | None = None , vid_tag_from_aud_args : Dict | None = None , vid_tag_from_frm_args : Dict | None = None , keep_tag_num : int [ int ] = 5 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to simply augment samples in English based on nlpaug library.
+Mapper to generate video captions by summarizing several kinds of generated
+texts (captions from video/audio/frames, tags from audio/frames, …)
-
-__init__ ( sequential : bool = False , aug_num : int [ int ] = 1 , keep_original_sample : bool = True , delete_random_word : bool = False , swap_random_word : bool = False , spelling_error_word : bool = False , split_random_word : bool = False , keyboard_error_char : bool = False , ocr_error_char : bool = False , delete_random_char : bool = False , swap_random_char : bool = False , insert_random_char : bool = False , * args , ** kwargs ) [source]
-Initialization method. All augmentation methods use default parameters
-in default. We recommend you to only use 1-3 augmentation methods at a
-time. Otherwise, the semantics of samples might be changed
-significantly.
+
+__init__ ( hf_summarizer : str | None = None , trust_remote_code : bool = False , consider_video_caption_from_video : bool = True , consider_video_caption_from_audio : bool = True , consider_video_caption_from_frames : bool = True , consider_video_tags_from_audio : bool = True , consider_video_tags_from_frames : bool = True , vid_cap_from_vid_args : Dict | None = None , vid_cap_from_frm_args : Dict | None = None , vid_tag_from_aud_args : Dict | None = None , vid_tag_from_frm_args : Dict | None = None , keep_tag_num : int [ int ] = 5 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
+Initialization method.
Parameters:
-sequential – whether combine all augmentation methods to a
-sequence. If it’s True, a sample will be augmented by all opened
-augmentation methods sequentially. If it’s False, each opened
-augmentation method would generate its augmented samples
-independently.
-aug_num – number of augmented samples to be generated. If
-sequential is True, there will be total aug_num augmented samples
-generated. If it’s False, there will be (aug_num *
-#opened_aug_method) augmented samples generated.
+hf_summarizer – the summarizer model used to summarize texts
+generated by other methods.
+consider_video_caption_from_video – whether to consider the video
+caption generated from video directly in the summarization process.
+Default: True.
+consider_video_caption_from_audio – whether to consider the video
+caption generated from audio streams in the video in the
+summarization process. Default: True.
+consider_video_caption_from_frames – whether to consider the
+video caption generated from sampled frames from the video in the
+summarization process. Default: True.
+consider_video_tags_from_audio – whether to consider the video
+tags generated from audio streams in the video in the summarization
+process. Default: True.
+consider_video_tags_from_frames – whether to consider the video
+tags generated from sampled frames from the video in the
+summarization process. Default: True.
+vid_cap_from_vid_args – the arg dict for video captioning from
+video directly, whose keys are the arg names and whose values are
+the arg values. Default: None.
+vid_cap_from_frm_args – the arg dict for video captioning from
+sampled frames from the video, whose keys are the arg names and
+whose values are the arg values. Default: None.
+vid_tag_from_aud_args – the arg dict for video tagging from audio
+streams in the video, whose keys are the arg names and whose values
+are the arg values. Default: None.
+vid_tag_from_frm_args – the arg dict for video tagging from
+sampled frames from the video, whose keys are the arg names and
+whose values are the arg values. Default: None.
+keep_tag_num – max number N of tags from sampled frames to keep.
+Too many tags might negatively influence the summarized text, so
+we only keep the N most frequent tags. Default: 5.
keep_original_sample – whether to keep the original sample. If
-it’s set to False, there will be only generated texts in the final
-datasets and the original texts will be removed. It’s True in
-default.
-delete_random_word – whether to open the augmentation method of
-deleting random words from the original texts. e.g. “I love LLM”
-–> “I LLM”
-swap_random_word – whether to open the augmentation method of
-swapping random contiguous words in the original texts. e.g. “I
-love LLM” –> “Love I LLM”
-spelling_error_word – whether to open the augmentation method of
-simulating the spelling error for words in the original texts. e.g.
-“I love LLM” –> “Ai love LLM”
-split_random_word – whether to open the augmentation method of
-splitting words randomly with whitespaces in the original texts.
-e.g. “I love LLM” –> “I love LL M”
-keyboard_error_char – whether to open the augmentation method of
-simulating the keyboard error for characters in the original texts.
-e.g. “I love LLM” –> “I ;ov4 LLM”
-ocr_error_char – whether to open the augmentation method of
-simulating the OCR error for characters in the original texts.
-e.g. “I love LLM” –> “I 10ve LLM”
-delete_random_char – whether to open the augmentation method of
-deleting random characters from the original texts. e.g. “I love
-LLM” –> “I oe LLM”
-swap_random_char – whether to open the augmentation method of
-swapping random contiguous characters in the original texts.
-e.g. “I love LLM” –> “I ovle LLM”
-insert_random_char – whether to open the augmentation method of
-inserting random characters into the original texts. e.g. “I love
-LLM” –> “I ^lKove LLM”
+it’s set to False, there will be only summarized captions in the
+final datasets and the original captions will be removed. It’s True
+in default.
args – extra args
kwargs – extra args
@@ -1901,27 +1721,27 @@
-
-process_batched ( samples ) [source]
+
+process_batched ( samples , rank = None ) [source]
-
-class data_juicer.ops.mapper. VideoCaptioningFromFramesMapper ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoCaptioningFromVideoMapper ( hf_video_blip : str = 'kpyu/video-blip-opt-2.7b-ego4d' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
Bases: Mapper
Mapper to generate samples whose captions are generated based on
-an image-to-text model and sampled video frames. Captions from different
-frames will be concatenated to a single string.
+a video-to-text model and sampled video frame.
-
-__init__ ( hf_img2seq : str = 'Salesforce/blip2-opt-2.7b' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
+
+__init__ ( hf_video_blip : str = 'kpyu/video-blip-opt-2.7b-ego4d' , trust_remote_code : bool = False , caption_num : int [ int ] = 1 , keep_candidate_mode : str = 'random_any' , keep_original_sample : bool = True , prompt : str | None = None , prompt_key : str | None = None , frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , horizontal_flip : bool = False , vertical_flip : bool = False , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-hf_img2seq – model name on huggingface to generate caption
+hf_video_blip – video-blip model name on huggingface
+to generate caption
caption_num – how many candidate captions to generate
for each video
keep_candidate_mode –
retain strategy for the generated
@@ -1954,7 +1774,7 @@
it’s set to False, there will be only generated captions in the
final datasets and the original captions will be removed. It’s True
in default.
-prompt – a string prompt to guide the generation of image-to-text
+
prompt – a string prompt to guide the generation of video-blip
model for all samples globally. It’s None in default, which means
no prompt provided.
prompt_key – the key name of fields in samples to store prompts
@@ -1984,8 +1804,8 @@
-
-process_batched ( samples , rank = None , context = False ) [source]
+
+process_batched ( samples , rank = None , context = False ) [source]
Parameters:
samples –
@@ -2008,21 +1828,172 @@
-
-class data_juicer.ops.mapper. RemoveLongWordsMapper ( min_len : int = 1 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoFFmpegWrappedMapper ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
+Bases: Mapper
+Simple wrapper for FFmpeg video filters.
+
+
+__init__ ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+filter_name – ffmpeg video filter name.
+filter_kwargs – keyword-arguments passed to ffmpeg filter.
+global_args – list-arguments passed to ffmpeg command-line.
+capture_stderr – whether to capture stderr.
+overwrite_output – whether to overwrite output file.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+process_single ( sample ) [source]
+For sample level, sample –> sample
+
+Parameters:
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
+
+
+
+
+
+class data_juicer.ops.mapper. VideoFaceBlurMapper ( cv_classifier : str = '' , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
+Bases: Mapper
+Mapper to blur faces detected in videos.
+
+
+__init__ ( cv_classifier : str = '' , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+cv_classifier – OpenCV classifier path for face detection.
+By default, we will use ‘haarcascade_frontalface_alt.xml’.
+blur_type – Type of blur kernel, including
+[‘mean’, ‘box’, ‘gaussian’].
+radius – Radius of blur kernel.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+process_single ( sample , context = False ) [source]
+For sample level, sample –> sample
+
+Parameters:
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
+
+
+
+
+
+class data_juicer.ops.mapper. VideoRemoveWatermarkMapper ( roi_strings : List [ str ] = ['0,0,0.1,0.1'] , roi_type : str = 'ratio' , roi_key : str | None = None , frame_num : int [ int ] = 10 , min_frame_threshold : int [ int ] = 7 , detection_method : str = 'pixel_value' , * args , ** kwargs ) [source]
+Bases: Mapper
+Remove the watermarks in videos given regions.
+
+
+__init__ ( roi_strings : List [ str ] = ['0,0,0.1,0.1'] , roi_type : str = 'ratio' , roi_key : str | None = None , frame_num : int [ int ] = 10 , min_frame_threshold : int [ int ] = 7 , detection_method : str = 'pixel_value' , * args , ** kwargs ) [source]
+Initialization method.
+
+Parameters:
+
+roi_strings – a given list of regions the watermarks locate.
+The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”,
+or “[x1, y1, x2, y2]”.
+roi_type – the roi string type. When the type is ‘pixel’, (x1,
+y1), (x2, y2) are the locations of pixels in the top left corner
+and the bottom right corner respectively. If the roi_type is
+‘ratio’, the coordinates are normalized by widths and heights.
+roi_key – the key name of fields in samples to store roi_strings
+for each sample. It’s used for set different rois for different
+samples. If it’s none, use rois in parameter “roi_strings”.
+It’s None in default.
+frame_num – the number of frames to be extracted uniformly from
+the video to detect the pixels of watermark.
+min_frame_threshold – a coordinate is considered to be the
+location of a watermark pixel when it is detected as one in no fewer
+than min_frame_threshold frames.
+detection_method – the method to detect the pixels of watermark.
+If it is ‘pixel_value’, we consider the distribution of pixel
+value in each frame. If it is ‘pixel_diversity’, we will consider
+the pixel diversity in different frames. The min_frame_threshold
+is useless and frame_num must be greater than 1 in
+‘pixel_diversity’ mode.
+args – extra args
+kwargs – extra args
+
+
+
+
+
+
+
+process_single ( sample , context = False ) [source]
+For sample level, sample –> sample
+
+Parameters:
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
+
+
+
+
+
+class data_juicer.ops.mapper. VideoResizeAspectRatioMapper ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , strategy : str = 'increase' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to remove long words within a specific range.
+Mapper to resize videos by aspect ratio.
+AspectRatio = W / H.
+
+
+STRATEGY = ['decrease', 'increase']
+
+
-
-__init__ ( min_len : int = 1 , max_len : int = 9223372036854775807 , * args , ** kwargs ) [source]
+
+__init__ ( min_ratio : str = '9/21' , max_ratio : str = '21/9' , strategy : str = 'increase' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-min_len – The min mapper word length in this op, words
-will be filtered if their length is below this parameter.
-max_len – The max mapper word length in this op, words
-will be filtered if their length exceeds this parameter.
+min_ratio – The minimum aspect ratio to enforce. Videos with
+an aspect ratio below min_ratio will be resized to match
+this minimum ratio. The ratio should be provided as a string
+in the format “9:21” or “9/21”.
+max_ratio – The maximum aspect ratio to enforce. Videos with
+an aspect ratio above max_ratio will be resized to match
+this maximum ratio. The ratio should be provided as a string
+in the format “21:9” or “21/9”.
+strategy – The resizing strategy to apply when adjusting the
+video dimensions. It can be either ‘decrease’ to reduce the
+dimension or ‘increase’ to enlarge it. Accepted values are
+[‘decrease’, ‘increase’].
args – extra args
kwargs – extra args
@@ -2031,14 +2002,18 @@
-
-should_keep_long_word ( word ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
+
+process_single ( sample ) [source]
+For sample level, sample –> sample
+
+Parameters:
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
@@ -2089,19 +2064,25 @@
-
-class data_juicer.ops.mapper. CleanEmailMapper ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoSplitByDurationMapper ( split_duration : float = 10 , min_last_split_duration : float = 0 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to clean email in text samples.
+Mapper to split video by duration.
-
-__init__ ( pattern : str | None = None , repl : str = '' , * args , ** kwargs ) [source]
+
+__init__ ( split_duration : float = 10 , min_last_split_duration : float = 0 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-pattern – regular expression pattern to search for within text.
-repl – replacement string, default is empty string.
+split_duration – duration of each video split in seconds.
+min_last_split_duration – The minimum allowable duration in
+seconds for the last video split. If the duration of the last
+split is less than this value, it will be discarded.
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only cut sample in the
+final datasets and the original sample will be removed. It’s True
+in default.
args – extra args
kwargs – extra args
@@ -2110,28 +2091,33 @@
-
-process_batched ( samples ) [source]
+
+split_videos_by_duration ( video_key , container ) [source]
+
+
+
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. ReplaceContentMapper ( pattern : str | List [ str ] | None = None , repl : str | List [ str ] = '' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoSplitByKeyFrameMapper ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to replace all content in the text that matches
-a specific regular expression pattern with a designated
-replacement string.
+Mapper to split video by key frame.
-
-__init__ ( pattern : str | List [ str ] | None = None , repl : str | List [ str ] = '' , * args , ** kwargs ) [source]
+
+__init__ ( keep_original_sample : bool = True , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-pattern – regular expression pattern(s) to search for within text
-repl – replacement string(s), default is empty string
+keep_original_sample – whether to keep the original sample. If
+it’s set to False, there will be only split sample in the
+final datasets and the original sample will be removed. It’s True
+in default.
args – extra args
kwargs – extra args
@@ -2140,29 +2126,39 @@
-
-process_batched ( samples ) [source]
+
+get_split_key_frame ( video_key , container ) [source]
+
+
+
+
+process_batched ( samples ) [source]
-
-class data_juicer.ops.mapper. AudioFFmpegWrappedMapper ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoSplitBySceneMapper ( detector : str = 'ContentDetector' , threshold : float [ float ] = 27.0 , min_scene_len : int [ int ] = 15 , show_progress : bool = False , * args , ** kwargs ) [source]
Bases: Mapper
-Simple wrapper for FFmpeg audio filters.
+Mapper to cut videos into scene clips.
+
+
+avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
+
+
-
-__init__ ( filter_name : str | None = None , filter_kwargs : Dict | None = None , global_args : List [ str ] | None = None , capture_stderr : bool = True , overwrite_output : bool = True , * args , ** kwargs ) [source]
+
+__init__ ( detector : str = 'ContentDetector' , threshold : float [ float ] = 27.0 , min_scene_len : int [ int ] = 15 , show_progress : bool = False , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-filter_name – ffmpeg audio filter name.
-filter_kwargs – keyword-arguments passed to ffmpeg filter.
-global_args – list-arguments passed to ffmpeg command-line.
-capture_stderr – whether to capture stderr.
-overwrite_output – whether to overwrite output file.
+detector – Algorithm from scenedetect.detectors. Should be one
+of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector’].
+threshold – Threshold passed to the detector.
+min_scene_len – Minimum length of any scene.
+show_progress – Whether to show progress from scenedetect.
args – extra args
kwargs – extra args
@@ -2171,8 +2167,8 @@
-
-process_single ( sample ) [source]
+
+process_single ( sample , context = False ) [source]
For sample level, sample –> sample
Parameters:
@@ -2187,25 +2183,22 @@
-
-class data_juicer.ops.mapper. VideoSplitByDurationMapper ( split_duration : float = 10 , min_last_split_duration : float = 0 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoTaggingFromAudioMapper ( hf_ast : str = 'MIT/ast-finetuned-audioset-10-10-0.4593' , trust_remote_code : bool = False , tag_field_name : str = '__dj__video_audio_tags__' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to split video by duration.
+Mapper to generate video tags from audio streams extracted from videos
+using the Audio Spectrogram Transformer.
-
-__init__ ( split_duration : float = 10 , min_last_split_duration : float = 0 , keep_original_sample : bool = True , * args , ** kwargs ) [source]
+
+__init__ ( hf_ast : str = 'MIT/ast-finetuned-audioset-10-10-0.4593' , trust_remote_code : bool = False , tag_field_name : str = '__dj__video_audio_tags__' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-split_duration – duration of each video split in seconds.
-min_last_split_duration – The minimum allowable duration in
-seconds for the last video split. If the duration of the last
-split is less than this value, it will be discarded.
-keep_original_sample – whether to keep the original sample. If
-it’s set to False, there will be only cut sample in the
-final datasets and the original sample will be removed. It’s True
-in default.
+hf_ast – path to the HF model to tag from audios.
+trust_remote_code – whether to trust the remote code of HF models
+tag_field_name – the field name to store the tags. It’s
+“__dj__video_audio_tags__” in default.
args – extra args
kwargs – extra args
@@ -2214,34 +2207,48 @@
-
-split_videos_by_duration ( video_key , container ) [source]
-
-
-
-
-process_batched ( samples ) [source]
-
+
+process_single ( sample , rank = None ) [source]
+For sample level, sample –> sample
+
+Parameters:
+sample – sample to process
+
+Returns:
+processed sample
+
+
+
-
-class data_juicer.ops.mapper. VideoFaceBlurMapper ( cv_classifier : str = '' , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. VideoTaggingFromFramesMapper ( frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to blur faces detected in videos.
+Mapper to generate video tags from frames extracted from videos.
-
-__init__ ( cv_classifier : str = '' , blur_type : str = 'gaussian' , radius : float = 2 , * args , ** kwargs ) [source]
+
+__init__ ( frame_sampling_method : str = 'all_keyframes' , frame_num : int [ int ] = 3 , tag_field_name : str = '__dj__video_frame_tags__' , * args , ** kwargs ) [source]
Initialization method.
Parameters:
-cv_classifier – OpenCV classifier path for face detection.
-By default, we will use ‘haarcascade_frontalface_alt.xml’.
-blur_type – Type of blur kernel, including
-[‘mean’, ‘box’, ‘gaussian’].
-radius – Radius of blur kernel.
+frame_sampling_method – sampling method of extracting frame
+images from the videos. Should be one of
+[“all_keyframes”, “uniform”].
+The former one extracts all key frames (the number of which depends
+on the duration of the video) and the latter one extracts a specified
+number of frames uniformly from the video.
+Default: “all_keyframes”.
+frame_num – the number of frames to be extracted uniformly from
+the video. Only works when frame_sampling_method is “uniform”. If
+it’s 1, only the middle frame will be extracted. If it’s 2, only
+the first and the last frames will be extracted. If it’s larger
+than 2, in addition to the first and the last frames, other frames
+will be extracted uniformly within the video duration.
+tag_field_name – the field name to store the tags. It’s
+“__dj__video_frame_tags__” by default.
args – extra args
kwargs – extra args
@@ -2250,8 +2257,8 @@
-
-process_single ( sample , context = False ) [source]
+
+process_single ( sample , rank = None , context = False ) [source]
For sample level, sample –> sample
Parameters:
@@ -2266,18 +2273,17 @@
-
-class data_juicer.ops.mapper. ImageTaggingMapper ( tag_field_name : str = '__dj__image_tags__' , * args , ** kwargs ) [source]
+
+class data_juicer.ops.mapper. WhitespaceNormalizationMapper ( * args , ** kwargs ) [source]
Bases: Mapper
-Mapper to generate image tags.
+Mapper to normalize different kinds of whitespaces to whitespace ‘ ‘ (0x20)
+in text samples.
+Different kinds of whitespaces can be found here:
+https://en.wikipedia.org/wiki/Whitespace_character
-
-__init__ ( tag_field_name : str = '__dj__image_tags__' , * args , ** kwargs ) [source]
-Initialization method.
-:param tag_field_name: the field name to store the tags. It’s
-
-“__dj__image_tags__” in default.
-
+
+__init__ ( * args , ** kwargs ) [source]
+Initialization method.
Parameters:
-
-process_single ( sample , rank = None , context = False ) [source]
-For sample level, sample –> sample
-
-Parameters:
-sample – sample to process
-
-Returns:
-processed sample
-
-
-
+
+process_batched ( samples ) [source]
+
diff --git a/genindex.html b/genindex.html
index 71efd4518..a34a5abdb 100644
--- a/genindex.html
+++ b/genindex.html
@@ -270,12 +270,12 @@ _
(data_juicer.ops.mapper.CleanLinksMapper method)
(data_juicer.ops.mapper.ExpandMacroMapper method)
-
- (data_juicer.ops.mapper.ExtractQAMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
- (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
+
+ (data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
@@ -293,7 +293,7 @@ _
(data_juicer.ops.mapper.NlpcdaZhMapper method)
- (data_juicer.ops.mapper.OptimizeInstructionMapper method)
+ (data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.PunctuationNormalizationMapper method)
@@ -412,8 +412,12 @@ B
@@ -635,8 +639,6 @@ D
module
-
-
data_juicer.ops.mapper
@@ -651,6 +653,8 @@ D
module
+
+
data_juicer.tools
@@ -667,6 +671,36 @@ D
Deduplicator (class in data_juicer.ops)
+ DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+
+ DEFAULT_INPUT_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+
+
+ DEFAULT_OUTPUT_PATTERN (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+
+
+ DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+
+
+ DEFAULT_SYSTEM_PROMPT (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute)
+
+
DiversityAnalysis (class in data_juicer.analysis)
DocumentDeduplicator (class in data_juicer.ops.deduplicator)
@@ -694,11 +728,11 @@ E
execute_and_probe() (data_juicer.core.Adapter static method)
Executor (class in data_juicer.core)
-
- ExpandMacroMapper (class in data_juicer.ops.mapper)
@@ -735,14 +767,16 @@ F
G
+ get_sentences_from_document() (in module data_juicer.ops.common)
+
get_split_key_frame() (data_juicer.ops.mapper.VideoSplitByKeyFrameMapper method)
get_words_from_document() (in module data_juicer.ops.common)
@@ -842,8 +876,6 @@ L
load_from_disk() (data_juicer.core.NestedDataset static method)
load_ops() (in module data_juicer.ops)
-
- load_seed_qa_samples() (data_juicer.ops.mapper.GenerateInstructionMapper method)
LocalFormatter (class in data_juicer.format)
@@ -858,8 +890,6 @@ M
Mapper (class in data_juicer.ops)
MAX_BATCH_SIZE (data_juicer.core.Adapter attribute)
-
- max_rouge_l_score() (data_juicer.ops.mapper.GenerateInstructionMapper method)
MaximumLineLengthFilter (class in data_juicer.ops.filter)
@@ -938,10 +968,14 @@ N
O
@@ -952,10 +986,18 @@ P
ParquetFormatter (class in data_juicer.format)
- parse_chatml_str() (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ parse_output() (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
+
+
PerplexityFilter (class in data_juicer.ops.filter)
PhraseGroundingRecallFilter (class in data_juicer.ops.filter)
@@ -1026,6 +1068,8 @@ P
(data_juicer.ops.mapper.ExpandMacroMapper method)
(data_juicer.ops.mapper.FixUnicodeMapper method)
+
+ (data_juicer.ops.mapper.GenerateQAFromTextMapper method)
(data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper method)
@@ -1155,9 +1199,7 @@ P
(data_juicer.ops.mapper.AudioFFmpegWrappedMapper method)
- (data_juicer.ops.mapper.ExtractQAMapper method)
-
- (data_juicer.ops.mapper.GenerateInstructionMapper method)
+ (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method)
(data_juicer.ops.mapper.ImageBlurMapper method)
@@ -1165,7 +1207,7 @@ P
(data_juicer.ops.mapper.ImageTaggingMapper method)
- (data_juicer.ops.mapper.OptimizeInstructionMapper method)
+ (data_juicer.ops.mapper.OptimizeQAMapper method)
(data_juicer.ops.mapper.VideoFaceBlurMapper method)
diff --git a/index.html b/index.html
index 63647e7b5..3eaf19f33 100644
--- a/index.html
+++ b/index.html
@@ -104,111 +104,113 @@ data_juicer.ops.mapper
data_juicer.ops.deduplicator
data_juicer.ops.selector
diff --git a/objects.inv b/objects.inv
index 3ef7dcac8..f6f555efe 100644
Binary files a/objects.inv and b/objects.inv differ
diff --git a/searchindex.js b/searchindex.js
index 0a6106f31..3c211b572 100644
--- a/searchindex.js
+++ b/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 
2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 8, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 
3], "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": 3, "sample_interv": 3, "0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 10], "resourc": 3, "util": 3, "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "set": [3, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": 3, "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8], "pre": 3, "execut": [3, 9], "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 
3, "won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": [3, 9], "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "load_from_disk": 3, "wa": 3, "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "str": [3, 4, 6, 7, 8, 9, 10], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "f": [3, 4], "instanc": [3, 5], "deprec": 3, "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "3": [3, 8, 9], "pleas": [3, 7, 9], "storage_opt": 3, "instead": [3, 4, 6], "keep_in_memori": 3, "bool": [3, 7, 8, 9, 10], "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "enabl": [3, 9], "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 13], "improv": 3, "section": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 
8, 9], "py": [3, 4], "d": [3, 4], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], "dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "analyze_resource_util_list": 3, "resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 
8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 7, 8, 9, 10], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 
5, "op_fus": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 7, 8, 9], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": [6, 9], "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "exact": 7, "match": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], 
"documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": [7, 9], "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "kept": [7, 8, 9], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": 7, "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "reduce_mod": 8, "within": [8, 9, 10], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": 8, "take": 8, "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": [8, 9], "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, 
"falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9, 10], "addit": [8, 9], "durat": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, 
"xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9, 10], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], 
"min_face_count": 8, "max_face_count": 8, "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "imagefaceratiofilt": [8, 13], "largest": [8, 10], "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "imagepairsimilarityfilt": [8, 13], "closedunitinterv": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "transform": 9, "hf": 9, "trust": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 
9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, 
"s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeinstructionmapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen2": 9, "instruct": 9, "enable_vllm": 9, "tensor_parallel_s": 9, "max_model_len": 9, "max_num_seq": 9, "sampling_param": 9, "5b": 9, "hugginfac": 9, "vllm": 9, "infer": 9, "acceler": 9, "valid": 9, "tensor": 9, "parallel": 9, "unspecifi": 9, "automat": 9, "deriv": 9, "top_p": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 
9, "enforc": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "suitabl": 9, "interfac": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "generateinstructionmapp": [9, 13], "chat": 9, "seed_fil": 9, "instruct_num": 9, "similarity_threshold": 9, "prompt_templ": 9, "qa_pair_templ": 9, "example_templ": 9, "qa_extraction_pattern": 9, "your": 9, 
"put": 9, "templat": 9, "make": 9, "sure": 9, "augmented_data": 9, "\u95ee\u9898": 9, "\u56de\u7b54": 9, "qa_pair": 9, "\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "load_seed_qa_sampl": 9, "qa": 9, "build_prompt": 9, "qa_sampl": 9, "parse_chatml_str": 9, "input_str": 9, "parse_respons": 9, "response_str": 9, "max_rouge_l_scor": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 
0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], 
[3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "monitor_all_resources"], [3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": 
[[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", 
"__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", 
"MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], 
"data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", 
"__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", 
"process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateInstructionMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 
1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "OptimizeInstructionMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", 
"__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateInstructionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_prompt"], [9, 2, 1, "", "load_seed_qa_samples"], [9, 2, 1, "", "max_rouge_l_score"], [9, 2, 1, "", "parse_chatml_str"], [9, 2, 1, "", "parse_response"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeInstructionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], 
"data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], 
"data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", 
"module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, 
"indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, 
"data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, 
"data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() 
(data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in 
data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], 
"__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, 
"data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, 
"data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, 
"data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() 
(data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, 
"data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in 
data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, 
"data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, 
"data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, 
"data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() 
(data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() 
(data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], 
"compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter 
method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, 
"data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, 
"data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": 
[[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateinstructionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeinstructionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() 
(data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, 
"data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeinstructionmapper method)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() 
(data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, 
"data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_prompt() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.build_prompt"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "load_seed_qa_samples() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.load_seed_qa_samples"]], "max_rouge_l_score() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.max_rouge_l_score"]], "parse_chatml_str() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.parse_chatml_str"]], "parse_response() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.parse_response"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper 
method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() 
(data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, 
"data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateinstructionmapper method)": [[9, "data_juicer.ops.mapper.GenerateInstructionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeinstructionmapper method)": [[9, "data_juicer.ops.mapper.OptimizeInstructionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], 
"process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() 
(data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}})
\ No newline at end of file
+Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7, 8, 9], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 
2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 3, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 3, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 3, 7, 8, 9], "inform": [1, 3, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 8, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": [2, 9], "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": [2, 
3], "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 3, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 3, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": [2, 3, 4, 9], "origin": [2, 3, 8, 9], "expect": [2, 3, 9], "cfg_after_merg": 2, "adapt": [3, 13], "max_batch_s": 3, "10000": 3, "static": 3, "execute_and_prob": 3, "oper": 3, "sample_interv": 3, "0": [3, 4, 5, 7, 8, 9], "5": [3, 7, 8, 9], "input": [3, 5, 7, 8, 9, 10], "probe": 3, "relat": [3, 8], "op": [3, 13], "specifi": [3, 4, 6, 8, 9, 10], "For": [3, 5, 7, 8, 9], "now": [3, 6, 9], "we": [3, 4, 7, 8, 9, 13], "support": [3, 8, 9], "follow": [3, 9], "target": [3, 8, 10], "resourc": 3, "util": 3, "speed": 3, "averag": [3, 8], "The": [3, 4, 5, 8, 9, 10], "item": [3, 5], "take_batch": 3, "config": [3, 5, 9, 13], "split": [3, 6, 9], "batch": [3, 9], "factor": 3, "set": [3, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "iter": [3, 8, 9], "adapt_workload": 3, "manag": 3, "schedul": 3, "balanc": 3, "need": [3, 6, 8, 9, 10], "recip": 3, "probe_small_batch": 3, "perform": 3, "small": [3, 8], "pre": 3, "execut": 3, "avail": [3, 8], "current": 3, "estim": 3, "rank": [3, 8, 9, 10], "A": [3, 5, 7, 9], "length": [3, 4, 8, 9], "batch_size_strategi": 3, "load_analysis_r": 3, "base_b": 3, "util_th": 3, "9": [3, 8, 9], "decid": [3, 5, 7, 8], "accord": [3, 4, 5, 8, 9], "workload": 3, "analysi": [3, 13], "threshold": [3, 7, 8, 9], "guarante": 3, 
"won": [3, 7], "t": [3, 4, 6, 7], "exce": [3, 8, 9], "onli": [3, 7, 8, 9], "consid": [3, 7, 8, 9], "bucket": 3, "effect": 3, "which": [3, 5, 7, 8, 9], "max": [3, 4, 7, 8, 9], "except": [3, 9], "gpu": 3, "thi": [3, 4, 5, 6, 7, 8, 9, 10], "It": [3, 4, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "work_dir": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 9, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "load_from_disk": 3, "wa": 3, "previous": 3, "save_to_disk": 3, "directori": [3, 4, 8], "filesystem": 3, "ani": [3, 8, 9], "implement": [3, 7], "fsspec": 3, "spec": 3, "abstractfilesystem": 3, "dataset_path": [3, 4], "str": [3, 4, 6, 7, 8, 9, 10], "train": [3, 9], "remot": [3, 9], "uri": 3, "s3": 3, "my": 3, "where": 3, "f": [3, 4], "instanc": [3, 5], "deprec": 3, "version": [3, 9], "2": [3, 6, 8, 9], "8": [3, 8, 9], "3": [3, 8, 9], "pleas": [3, 7, 9], "storage_opt": 3, "instead": [3, 4, 6], "keep_in_memori": 3, "bool": [3, 7, 8, 9, 10], "copi": 3, "memori": 3, "unless": 3, "explicitli": 3, "enabl": [3, 9], "in_memory_max_s": 3, "nonzero": 3, "see": [3, 13], "more": [3, 8, 9, 13], "detail": [3, 13], "improv": 3, "section": 3, "kei": [3, 4, 5, 8, 9, 10], "pair": [3, 5, 7, 8, 9], "pass": [3, 9], "system": [3, 9], "backend": 3, "ad": [3, 6, 9], "request": [3, 9], "datasetdict": 3, "exampl": [3, 8, 9], 
"py": [3, 4], "d": [3, 4], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 7, 8, 9, 10], "sample_algo": 3, "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "ratio": [3, 4, 6, 8, 9, 10], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "to_jsonl": 3, "jsonl": [3, 4], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "monitor": [3, 13], "other": [3, 8, 9], "dure": 3, "python": 3, "time": [3, 9], "10": [3, 8, 9], "timestamp": 3, "xxx": 3, "cpu": 3, "count": [3, 8], "free": 3, "mem": 3, "structur": 3, "abov": [3, 9], "field": [3, 4, 5, 7, 8, 9, 10], "first": [3, 6, 7, 8, 9], "level": [3, 5, 6, 7, 8, 9, 10], "resource_analysi": 3, "min": [3, 7, 8, 9], "avg": [3, 8], "those": [3, 8], "dynamic_field": 3, "monitor_all_resourc": 3, "detect": [3, 7, 8, 9], "node": 3, "monitor_current_resourc": 3, "machin": 3, "rang": [3, 8, 9, 10], "mb": [3, 8], "analyze_resource_util_list": 3, "resource_util_list": 3, "metric": [3, 5, 7, 8], "analyze_single_resource_util": 3, "resource_util_dict": 3, "monitor_func": 3, "show_num": [3, 5, 7], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], 
"trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "two": [3, 7, 8, 9], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "generated_dataset_config": [4, 9], "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "creat": 4, "provid": [4, 7, 9], "must": [4, 8, 9], "contain": [4, 6, 8, 9], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 7, 8, 9, 10], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "tsx": 4, "vb": 4, "makefil": 4, "xml": [4, 8, 9], "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8, 9], "datasset": 4, "dir": 4, "w1": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "random_sampl": 4, "sample_numb": 4, "seed": [4, 9], "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "random": [4, 9, 10], "42": 4, "emptyformatt": [4, 9, 13], "feature_kei": [4, 9], "empti": [4, 7, 9], "featur": 4, "properti": 4, "null_valu": 4, "rayemptyformatt": [4, 9, 13], "rai": [4, 7, 9], "load_op": [5, 13], "process_list": 5, 
"op_fus": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stats_batch": [5, 8], "process_batch": [5, 8, 9], "compute_stats_singl": [5, 7, 8], "context": [5, 7, 8, 9], "var": [5, 7, 8], "temporarili": [5, 7, 8], "process_singl": [5, 7, 8, 9], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "documentdedupl": [7, 13], "ignore_non_charact": 7, "exact": 7, "match": [7, 8, 9], "md5": 7, "ignor": [7, 9], "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "punctuat": [7, 9], "documentminhashdedupl": [7, 13], "window_s": 7, "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "7": [7, 9], "num_band": 7, 
"num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "kept": [7, 8, 9], "final": [7, 9], "should": [7, 8, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "shingl": 7, "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "jaccard": 7, "similar": [7, 8, 9], "regard": 7, "band": 7, "lsh": 7, "determin": [7, 9, 10], "optim": [7, 9], "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "imagededupl": [7, 13], "phash": 7, "consider_text": 7, "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "6380": 7, "basic": 7, "although": 7, "empty_hash_valu": 7, "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "raydocumentdedupl": [7, 13], "rayimagededupl": [7, 13], "rayvideodedupl": [7, 13], "videodedupl": [7, 13], "alphanumericfilt": [8, 13], "min_ratio": [8, 9], "25": 8, "max_ratio": [8, 9], "9223372036854775807": [8, 9], "numer": 8, "within": [8, 9, 10], "alphanumer": 8, "total": [8, 9], "below": [8, 9], "audiodurationfilt": [8, 13], "min_dur": 8, "max_dur": 8, "any_or_al": [8, 9], "durat": [8, 9], "second": [8, 9], "sy": 8, "maxsiz": 8, "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "averagelinelengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "flaggedwordfilt": [8, 13], "lang": [8, 9], "045": 8, "flagged_words_dir": 8, "home": 8, "runner": 8, "asset": 8, "flag": 8, 
"what": 8, "adopt": 8, "flagged_word": 8, "join": 8, "imageaestheticsfilt": [8, 13], "hf_scorer_model": 8, "trust_remote_cod": [8, 9], "min_scor": 8, "max_scor": 8, "aesthet": 8, "score": [8, 9], "predictor": 8, "By": [8, 9], "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "keyword": [8, 9], "imageaspectratiofilt": [8, 13], "333": 8, "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "imagefacecountfilt": [8, 13], "cv_classifi": [8, 9], "min_face_count": 8, "max_face_count": 8, "face": [8, 9], "opencv": [8, 9], "classifi": [8, 9], "haarcascade_frontalface_alt": [8, 9], "minimum": [8, 9], "requir": 8, "imagefaceratiofilt": [8, 13], "area": 8, "largest": [8, 10], "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "imagepairsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "closedunitinterv": 8, "imageshapefilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "shape": 8, "width": [8, 9], "height": [8, 9], "imagesizefilt": [8, 13], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "reduce_mod": 8, "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 9, 10], "chunk": 8, "take": 8, "imagetextsimilarityfilt": [8, 13], "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "watermark": [8, 9], "high": 8, "probabl": [8, 9], "languageidscorefilt": [8, 13], "confid": 8, "larger": [8, 9, 10], "identif": 8, "maximumlinelengthfilt": [8, 13], "perplexityfilt": [8, 13], 
"max_ppl": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": [8, 9], "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "out": 8, "larg": 8, "account": 8, "specialcharactersfilt": [8, 13], "specifiedfieldfilt": [8, 13], "field_kei": [8, 10], "target_valu": 8, "multi": [8, 10, 13], "retain": [8, 9], "specifiednumericfieldfilt": [8, 13], "min_valu": 8, "max_valu": 8, "specifiednumericfield": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "suffixfilt": [8, 13], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "zh": 8, "mini_action_num": 8, "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "mini_dependency_num": 8, "edg": 8, "depend": [8, 9], "objet": 8, "textlengthfilt": [8, 13], "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "hug": [8, 9], "videoaestheticsfilt": [8, 13], "frame_sampling_method": [8, 9], "frame_num": [8, 9], "frame": [8, 9], "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "while": 8, "usual": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "addit": [8, 9], "videoaspectratiofilt": [8, 13], "21": [8, 9], "videodurationfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "chineseclip": 8, "might": [8, 9], "choic": 8, "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "tupl": 8, "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "sequenc": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "equal": [8, 9, 10], "As": 8, "mai": 8, "shorter": 
[8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videonsfwfilt": [8, 13], "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "found": [8, 9], "here": [8, 9, 13], "http": [8, 9], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videoresolutionfilt": [8, 13], "resolut": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "tag_field_nam": [8, 9], "__dj__video_frame_tags__": [8, 9], "shift": 8, "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "videowatermarkfilt": [8, 13], "wordrepetitionfilt": [8, 13], "wordsnumfilt": [8, 13], "audioffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "output": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "cleancopyrightmapp": [9, 13], "clean": 9, "copyright": 9, "comment": 9, "begin": 9, "cleanemailmapp": [9, 13], "repl": 9, "email": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanhtmlmapp": [9, 13], "cleanipmapp": [9, 13], "ipv4": 9, "ipv6": 9, "address": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "latex": 9, "fixunicodemapp": [9, 13], "fix": 9, "unicod": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "generateqafromexamplesmapp": [9, 13], "hf_model": 9, "qwen": 9, "qwen2": 9, "7b": 9, "instruct": 9, 
"seed_fil": 9, "example_num": 9, "similarity_threshold": 9, "system_prompt": 9, "input_templ": 9, "example_templ": 9, "qa_pair_templ": 9, "output_pattern": 9, "enable_vllm": 9, "model_param": 9, "sampling_param": 9, "question": 9, "answer": 9, "you": 9, "your": 9, "default_system_prompt": 9, "\u8bf7\u4f60\u4ed4\u7ec6\u89c2\u5bdf\u591a\u4e2a\u793a\u4f8b\u6570\u636e\u7684\u8f93\u5165\u548c\u8f93\u51fa": 9, "\u6309\u7167\u4f60\u7684\u7406\u89e3": 9, "\u603b\u7ed3\u51fa\u76f8\u5e94\u89c4\u77e9": 9, "\u7136\u540e\u5199\u51fa\u4e00\u4e2a\u65b0\u7684": 9, "\u95ee\u9898": 9, "\u548c": 9, "\u56de\u7b54": 9, "\u6ce8\u610f": 9, "\u65b0\u751f\u6210\u7684": 9, "\u9700\u8981\u6ee1\u8db3\u5982\u4e0b\u8981\u6c42": 9, "n1": 9, "\u751f\u6210\u7684": 9, "\u4e0d\u80fd\u4e0e\u8f93\u5165\u7684": 9, "\u4e00\u81f4": 9, "\u4f46\u662f\u9700\u8981\u4fdd\u6301\u683c\u5f0f\u76f8\u540c": 9, "n2": 9, "\u4e0d\u4e00\u5b9a\u8981\u5c40\u9650\u4e8e\u8f93\u5165": 9, "\u7684\u8bdd\u9898\u6216\u9886\u57df": 9, "\u9700\u8981\u6b63\u786e\u56de\u7b54\u751f\u6210\u7684": 9, "n3": 9, "\u63d0\u4f9b\u7684": 9, "\u53ef\u80fd\u662f\u591a\u8f6e\u5bf9\u8bdd": 9, "\u4e5f\u53ef\u4ee5\u662f\u591a\u8f6e": 9, "n4": 9, "\u5fc5\u987b\u6210\u5bf9\u51fa\u73b0": 9, "\u800c\u4e14": 9, "\u9700\u8981\u5728": 9, "\u4e4b\u524d": 9, "default_input_templ": 9, "default_example_templ": 9, "n\u5982\u4e0b\u662f\u4e00\u6761\u793a\u4f8b\u6570\u636e": 9, "default_qa_pair_templ": 9, "default_output_pattern": 9, "hugginfac": 9, "id": 9, "chatml": 9, "put": 9, "prompt": 9, "qa": 9, "guid": 9, "task": 9, "templat": 9, "build": 9, "placehold": 9, "defin": 9, "qa_pair": 9, "respons": 9, "vllm": 9, "infer": 9, "acceler": 9, "temperatur": 9, "top_p": 9, "build_input": 9, "qa_exampl": 9, "parse_output": 9, "raw_output": 9, "generateqafromtextmapp": [9, 13], "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "suitabl": 9, "interfac": 9, 
"\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imageblurmapp": [9, 13], "p": 9, "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "blure": 9, "kernel": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "user_prompt": 9, "user_prompt_kei": 9, "keep_original_sampl": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "control": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "uers_prompt_kei": 9, "imagecaptioningmapp": [9, 13], "hf_img2seq": 9, "blip2": 9, "opt": 9, "caption_num": 9, "keep_candidate_mod": 9, "random_ani": 9, "prompt_kei": 9, "caption": 9, "anoth": 9, "how": 9, "mani": 9, "candid": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "similar_on": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "point": 9, "fp16": 9, "bf16": 9, "branch": 9, "commit": 9, "git": 9, "extent": 9, "transform": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "produc": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "imagetaggingmapp": [9, 13], "__dj__image_tags__": 9, "nlpaugenmapp": [9, 13], "sequenti": 9, "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "delete_random_char": 
9, "swap_random_char": 9, "insert_random_char": 9, "simpli": 9, "nlpaug": 9, "librari": 9, "semant": 9, "significantli": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "delet": 9, "love": 9, "llm": 9, "swap": 9, "contigu": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "nlpcdazhmapp": [9, 13], "replace_similar_word": 9, "replace_homophone_char": 9, "replace_equivalent_num": 9, "nlpcda": 9, "notic": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "optimizeqamapp": [9, 13], "\u8bf7\u4f18\u5316\u8f93\u5165\u7684\u95ee\u7b54\u5bf9": 9, "\u4f7f": 9, "\u90fd\u66f4\u52a0\u8be6\u7ec6": 9, "\u51c6\u786e": 9, "\u5fc5\u987b\u6309\u7167\u4ee5\u4e0b\u6807\u8bb0\u683c\u5f0f": 9, "\u76f4\u63a5\u8f93\u51fa\u4f18\u5316\u540e\u7684\u95ee\u7b54\u5bf9": 9, "n\u4f18\u5316\u540e\u7684\u95ee\u9898": 9, "n\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "\u4ee5\u4e0b\u662f\u539f\u59cb\u95ee\u7b54\u5bf9": 9, "make": 9, "sure": 9, "optimizequerymapp": [9, 13], "queri": 9, "\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684": 9, "\u5c06\u5176\u66f4\u52a0\u8be6\u7ec6\u5177\u4f53": 9, "\u4f46\u4ecd\u53ef\u4ee5\u7531\u539f\u7b54\u6848\u56de\u7b54": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684": 9, "\u4e0d\u8981\u8f93\u51fa\u591a\u4f59\u5185\u5bb9": 9, "optimizeresponsemapp": [9, 13], "\u8bf7\u4f18\u5316\u95ee\u7b54\u5bf9\u4e2d\u7684\u56de\u7b54": 9, 
"\u4f46\u4ecd\u53ef\u4ee5\u56de\u7b54\u539f\u95ee\u9898": 9, "\u53ea\u8f93\u51fa\u4f18\u5316\u540e\u7684\u56de\u7b54": 9, "punctuationnormalizationmapp": [9, 13], "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "drop": 9, "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "letter": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "removetabletextmapp": [9, 13], "min_col": 9, "max_col": 9, "20": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "replacecontentmapp": [9, 13], "design": 9, "sentencesplitmapp": [9, 13], "videocaptioningfromaudiomapp": [9, 13], "stream": 9, "videocaptioningfromframesmapp": [9, 13], "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videoffmpegwrappedmapp": [9, 13], "videofaceblurmapp": [9, 13], "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, 
"useless": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "cut": 9, "split_videos_by_dur": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "progress": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "__dj__video_audio_tags__": 9, "spectrogram": 9, "hf": 9, "trust": 9, "videotaggingfromframesmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", 
"cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Adapter"], [3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "Monitor"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Adapter": [[3, 4, 1, "", "MAX_BATCH_SIZE"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "adapt_workloads"], [3, 2, 1, "", "batch_size_strategy"], [3, 2, 1, "", "execute_and_probe"], [3, 2, 1, "", "probe_small_batch"], [3, 2, 1, "", "take_batch"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.Monitor": [[3, 4, 1, "", "DYNAMIC_FIELDS"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "analyze_resource_util_list"], [3, 2, 1, "", "analyze_single_resource_util"], [3, 2, 1, "", "monitor_all_resources"], 
[3, 2, 1, "", "monitor_current_resources"], [3, 2, 1, "", "monitor_func"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "load_from_disk"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "EmptyFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RayEmptyFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.EmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RayEmptyFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 5, 1, "", "null_value"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, 
"", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats_batched"], [5, 2, 1, "", "compute_stats_single"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process_batched"], [5, 2, 1, "", "process_single"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 
1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats_single"], [7, 2, 1, "", "process_single"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceCountFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImagePairSimilarityFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 
1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], 
"data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceCountFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImagePairSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", 
"__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", 
"process_single"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_single"], [8, 2, 1, "", "process_single"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats_batched"], [8, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "GenerateQAFromExamplesMapper"], [9, 1, 1, "", "GenerateQAFromTextMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "ImageTaggingMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, 
"", "OptimizeQAMapper"], [9, 1, 1, "", "OptimizeQueryMapper"], [9, 1, 1, "", "OptimizeResponseMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], 
"data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.GenerateQAFromExamplesMapper": [[9, 4, 1, "", "DEFAULT_EXAMPLE_TEMPLATE"], [9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.GenerateQAFromTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "parse_output"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.ImageTaggingMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.OptimizeQAMapper": [[9, 4, 1, "", "DEFAULT_INPUT_TEMPLATE"], [9, 4, 1, "", "DEFAULT_OUTPUT_PATTERN"], [9, 4, 1, "", "DEFAULT_QA_PAIR_TEMPLATE"], [9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "build_input"], [9, 2, 1, "", 
"parse_output"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.OptimizeQueryMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.OptimizeResponseMapper": [[9, 4, 1, "", "DEFAULT_SYSTEM_PROMPT"], [9, 2, 1, "", "parse_output"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", 
"process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_single"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process_batched"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", 
"process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute", "5": "py:property"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"], "5": ["py", "property", "Python property"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "}": [[3, "id1"], [3, "id2"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": 
[[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], 
"analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "adapter (class in data_juicer.core)": [[3, "data_juicer.core.Adapter"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "dynamic_fields (data_juicer.core.monitor attribute)": [[3, "data_juicer.core.Monitor.DYNAMIC_FIELDS"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "max_batch_size (data_juicer.core.adapter attribute)": [[3, "data_juicer.core.Adapter.MAX_BATCH_SIZE"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "monitor (class in data_juicer.core)": [[3, "data_juicer.core.Monitor"]], "nesteddataset (class in 
data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.__init__"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "adapt_workloads() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.adapt_workloads"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "analyze_resource_util_list() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_resource_util_list"]], "analyze_single_resource_util() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.analyze_single_resource_util"]], "batch_size_strategy() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.batch_size_strategy"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "execute_and_probe() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.execute_and_probe"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() 
(data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "load_from_disk() (data_juicer.core.nesteddataset static method)": [[3, "data_juicer.core.NestedDataset.load_from_disk"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "monitor_all_resources() (data_juicer.core.monitor method)": [[3, "data_juicer.core.Monitor.monitor_all_resources"]], "monitor_current_resources() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_current_resources"]], "monitor_func() (data_juicer.core.monitor static method)": [[3, "data_juicer.core.Monitor.monitor_func"]], "probe_small_batch() (data_juicer.core.adapter method)": [[3, "data_juicer.core.Adapter.probe_small_batch"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "take_batch() (data_juicer.core.adapter static method)": [[3, "data_juicer.core.Adapter.take_batch"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], 
"trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "emptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.EmptyFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "rayemptyformatter (class in data_juicer.format)": [[4, "data_juicer.format.RayEmptyFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.emptyformatter attribute)": [[4, "data_juicer.format.EmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.rayemptyformatter attribute)": [[4, "data_juicer.format.RayEmptyFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, 
"data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.emptyformatter method)": [[4, "data_juicer.format.EmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.rayemptyformatter method)": [[4, "data_juicer.format.RayEmptyFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, 
"data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "null_value (data_juicer.format.emptyformatter property)": [[4, "data_juicer.format.EmptyFormatter.null_value"]], "null_value (data_juicer.format.rayemptyformatter property)": [[4, "data_juicer.format.RayEmptyFormatter.null_value"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats_single"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "process_batched() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_batched"]], "process_batched() (data_juicer.ops.mapper method)": [[5, 
"data_juicer.ops.Mapper.process_batched"]], "process_single() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process_single"]], "process_single() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process_single"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value 
(data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, 
"data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats_single"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() 
(data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "process_single() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process_single"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefacecountfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": 
[[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imagepairsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class 
in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, 
"data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, 
"data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, 
"data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, 
"data_juicer.ops.filter.PerplexityFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats_batched"]], "compute_stats_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats_batched"]], "compute_stats_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, 
"data_juicer.ops.filter.ImageNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textactionfilter method)": 
[[8, "data_juicer.ops.filter.TextActionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats_single"]], "compute_stats_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats_single"]], "data_juicer.ops.filter": [[8, 
"module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process_batched() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process_batched"]], "process_batched() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process_batched"]], "process_single() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process_single"]], "process_single() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, 
"data_juicer.ops.filter.FlaggedWordFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefacecountfilter method)": [[8, "data_juicer.ops.filter.ImageFaceCountFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagepairsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImagePairSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process_single"]], "process_single() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiedfieldfilter method)": 
[[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process_single"]], "process_single() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process_single"]], "process_single() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process_single"]], "process_single() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process_single"]], "process_single() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process_single"]], "process_single() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process_single"]], "process_single() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process_single"]], "process_single() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process_single"]], "process_single() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, 
"data_juicer.ops.filter.VideoResolutionFilter.process_single"]], "process_single() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process_single"]], "process_single() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process_single"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "default_example_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_EXAMPLE_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_INPUT_TEMPLATE"]], "default_input_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_INPUT_TEMPLATE"]], "default_output_pattern (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_OUTPUT_PATTERN"]], "default_output_pattern (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN"]], "default_qa_pair_template 
(data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_qa_pair_template (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_QA_PAIR_TEMPLATE"]], "default_system_prompt (data_juicer.ops.mapper.generateqafromexamplesmapper attribute)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeqamapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizequerymapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.DEFAULT_SYSTEM_PROMPT"]], "default_system_prompt (data_juicer.ops.mapper.optimizeresponsemapper attribute)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper.DEFAULT_SYSTEM_PROMPT"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "generateqafromexamplesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper"]], "generateqafromtextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "imagetaggingmapper 
(class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "optimizeqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper"]], "optimizequerymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper"]], "optimizeresponsemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.OptimizeResponseMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy 
(data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, 
"data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], 
"__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() 
(data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, 
"data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "build_input() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.build_input"]], "build_input() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.build_input"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "parse_output() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizequerymapper method)": [[9, "data_juicer.ops.mapper.OptimizeQueryMapper.parse_output"]], "parse_output() (data_juicer.ops.mapper.optimizeresponsemapper method)": [[9, 
"data_juicer.ops.mapper.OptimizeResponseMapper.parse_output"]], "process_batched() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.generateqafromtextmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, 
"data_juicer.ops.mapper.NlpcdaZhMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, 
"data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process_batched"]], "process_batched() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process_batched"]], "process_single() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.generateqafromexamplesmapper method)": [[9, "data_juicer.ops.mapper.GenerateQAFromExamplesMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.imagetaggingmapper method)": [[9, "data_juicer.ops.mapper.ImageTaggingMapper.process_single"]], "process_single() (data_juicer.ops.mapper.optimizeqamapper method)": [[9, "data_juicer.ops.mapper.OptimizeQAMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, 
"data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process_single"]], "process_single() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process_single"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in 
data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}})
\ No newline at end of file