From e3b3a6d2812eb38f5cc05dbeb3140432006c3039 Mon Sep 17 00:00:00 2001
From: drcege
Date: Tue, 5 Nov 2024 04:03:07 +0000
Subject: [PATCH] deploy: 65d7c918357a4ad14538ec35211bcac106f53d4c

---
 _modules/data_juicer/core/data.html           |   21 +-
 _modules/data_juicer/ops/base_op.html         |    9 +
 .../ops/mapper/extract_qa_mapper.html         |  275 --
 ... => generate_qa_from_examples_mapper.html} |  330 ++-
 .../mapper/generate_qa_from_text_mapper.html  |  255 ++
 .../mapper/optimize_instruction_mapper.html   |  224 --
 .../ops/mapper/optimize_qa_mapper.html        |  254 ++
 .../ops/mapper/optimize_query_mapper.html     |  129 +
 .../ops/mapper/optimize_response_mapper.html  |  129 +
 _modules/index.html                           |    8 +-
 data_juicer.ops.deduplicator.html             |  390 +--
 data_juicer.ops.filter.html                   | 2120 +++++++--------
 data_juicer.ops.mapper.html                   | 2303 ++++++++---------
 genindex.html                                 |   92 +-
 index.html                                    |  154 +-
 objects.inv                                   |  Bin 5920 -> 6052 bytes
 searchindex.js                                |    2 +-
 17 files changed, 3506 insertions(+), 3189 deletions(-)
 delete mode 100644 _modules/data_juicer/ops/mapper/extract_qa_mapper.html
 rename _modules/data_juicer/ops/mapper/{generate_instruction_mapper.html => generate_qa_from_examples_mapper.html} (50%)
 create mode 100644 _modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
 delete mode 100644 _modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
 create mode 100644 _modules/data_juicer/ops/mapper/optimize_qa_mapper.html
 create mode 100644 _modules/data_juicer/ops/mapper/optimize_query_mapper.html
 create mode 100644 _modules/data_juicer/ops/mapper/optimize_response_mapper.html

diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
index f20745f11..c8a2a259f 100644
--- a/_modules/data_juicer/core/data.html
+++ b/_modules/data_juicer/core/data.html
@@ -325,9 +325,10 @@

Source code for data_juicer.core.data

 
         if inspect.ismethod(called_func):
             # batched is required for fault-tolerant or batched OP
-            if not called_func.__self__.turbo or hasattr(
+            if callable(getattr(
                     called_func.__self__,
-                    'is_batched_op') and called_func.__self__.is_batched_op():
+                    'is_batched_op')) and called_func.__self__.is_batched_op(
+                    ) or not called_func.__self__.turbo:
                 kargs['batched'] = True
                 kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
                     called_func.__self__, 'is_batched_op'
@@ -335,6 +336,12 @@ 

Source code for data_juicer.core.data

             else:
                 kargs['batched'] = False
 
+            # rank is required for cuda model loading
+            if callable(
+                    getattr(called_func.__self__,
+                            'use_cuda')) and called_func.__self__.use_cuda():
+                kargs['with_rank'] = True
+
         if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
             new_fingerprint = generate_fingerprint(self, *args, **kargs)
             kargs['new_fingerprint'] = new_fingerprint
@@ -379,10 +386,12 @@ 

Source code for data_juicer.core.data

             called_func = called_func.__wrapped__
 
         # Batched is always required for fault tolerance
-        if inspect.ismethod(
-                called_func) and called_func.__self__.is_batched_op():
-            kargs['batched'] = True
-            kargs['batch_size'] = kargs.pop('batch_size', 1)
+        if inspect.ismethod(called_func):
+            if callable(getattr(
+                    called_func.__self__,
+                    'is_batched_op')) and called_func.__self__.is_batched_op():
+                kargs['batched'] = True
+                kargs['batch_size'] = kargs.pop('batch_size', 1)
 
         if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
             new_fingerprint = generate_fingerprint(self, *args, **kargs)
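The two hunks above decide the extra kwargs handed to datasets.map: batched=True whenever the bound OP answers is_batched_op() (or turbo is disabled), and with_rank=True whenever the OP answers use_cuda(), so each worker can pick its own GPU. A rough standalone sketch of that dispatch; FakeOp and build_map_kwargs are illustrative stand-ins, not Data-Juicer APIs:

    # Sketch only: mirrors the kwargs logic of the patch above, nothing more.
    import inspect

    class FakeOp:
        turbo = False
        batch_size = 4

        def is_batched_op(self):
            return True

        def use_cuda(self):
            return False

        def process_batched(self, samples):
            return samples

    def build_map_kwargs(called_func, kargs):
        if inspect.ismethod(called_func):
            op = called_func.__self__
            # batched is required for fault-tolerant or batched OPs
            if (callable(getattr(op, 'is_batched_op', None))
                    and op.is_batched_op()) or not op.turbo:
                kargs['batched'] = True
                kargs['batch_size'] = kargs.pop('batch_size', 1)
            else:
                kargs['batched'] = False
            # rank is required for cuda model loading
            if callable(getattr(op, 'use_cuda', None)) and op.use_cuda():
                kargs['with_rank'] = True
        return kargs

    op = FakeOp()
    print(build_map_kwargs(op.process_batched, {'batch_size': op.batch_size}))
    # prints {'batched': True, 'batch_size': 4}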
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index d6b0bf3f8..d736b81d0 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -81,6 +81,7 @@ 

Source code for data_juicer.ops.base_op

 import traceback
 from functools import wraps
 
+import numpy as np
 import pyarrow as pa
 from loguru import logger
 
@@ -212,6 +213,11 @@ 

Source code for data_juicer.ops.base_op

         self.image_key = kwargs.get('image_key', 'images')
         self.audio_key = kwargs.get('audio_key', 'audios')
         self.video_key = kwargs.get('video_key', 'videos')
+
+        self.query_key = kwargs.get('query_key', 'query')
+        self.response_key = kwargs.get('response_key', 'response')
+        self.history_key = kwargs.get('history_key', 'history')
+
         self.batch_size = kwargs.get('batch_size', 1000)
 
         # whether the model can be accelerated using cuda
@@ -289,6 +295,9 @@ 

Source code for data_juicer.ops.base_op

             dataset = NestedDataset(dataset)
         return dataset
 
+    def empty_history(self):
+        return np.empty((0, 0), dtype=str)
+
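The base_op additions above give every OP three more configurable sample keys (query, response, history) plus an empty_history() placeholder. A hypothetical sketch of a mapper filling those fields; DummyMapper is not a real operator, and the stated reason for the 0x0 array is an assumption:

    # Hypothetical illustration: only the key names and the empty_history()
    # shape come from the patch above.
    import numpy as np

    class DummyMapper:

        def __init__(self, **kwargs):
            self.query_key = kwargs.get('query_key', 'query')
            self.response_key = kwargs.get('response_key', 'response')
            self.history_key = kwargs.get('history_key', 'history')

        def empty_history(self):
            # 0x0 string array; presumably keeps the column's nested string
            # type stable for Arrow/HF datasets even when there is no history
            return np.empty((0, 0), dtype=str)

        def process_single(self, sample):
            sample[self.query_key] = '冰岛的首都是哪里呢?'
            sample[self.response_key] = '冰岛的首都是雷克雅未克(Reykjavik)。'
            sample[self.history_key] = self.empty_history()
            return sample

    print(DummyMapper().process_single({'text': '冰岛的首都是雷克雅未克(Reykjavik)'}))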
 
 
[docs]class Mapper(OP):
diff --git a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html b/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
deleted file mode 100644
index 4984076cb..000000000
--- a/_modules/data_juicer/ops/mapper/extract_qa_mapper.html
+++ /dev/null
@@ -1,275 +0,0 @@
- data_juicer.ops.mapper.extract_qa_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.extract_qa_mapper

-import json
-import re
-from typing import Dict, Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
-OP_NAME = 'extract_qa_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class ExtractQAMapper(Mapper): - """ - Mapper to extract question and answer pair from text samples. - Recommended model list: [ - 'alibaba-pai/pai-llama3-8b-doc2qa', - 'alibaba-pai/pai-baichuan2-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-4b-doc2qa', - 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-1b8-doc2qa', - 'alibaba-pai/pai-qwen1_5-0b5-doc2qa' - ] - These recommended models are all trained with Chinese data - and are suitable for Chinese. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - trust_remote_code: bool = False, - pattern: Optional[str] = None, - qa_format: str = 'chatml', - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param pattern: regular expression pattern to search for within text. - :param qa_format: Output format of question and answer pair. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. - e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - - The default data format parsed by this interface is as follows: - Model Input: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Model Output: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Human: 请问蒙古国的首都是哪里? - Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 - Human: 冰岛的首都是哪里呢? - Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 - ... - """ - - super().__init__(*args, **kwargs) - self.num_proc = 1 - - if pattern is None: - self.pattern = r'Human: (.*?)\nAssistant: (.*?)(?=\nHuman|$)' - else: - self.pattern = pattern - - self.qa_format = qa_format - self.enable_vllm = enable_vllm - - if enable_vllm: - - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) - self.sampling_params = sampling_params
- - def _extract_qa(self, output): - """Extract qestion and answer pair from model output response.""" - qa_list = [] - - pat = re.compile(self.pattern, re.DOTALL) - qa_pairs = pat.findall(output) - - for _, qa in enumerate(qa_pairs, 1): - user, assistant = qa - qa_list.append((user.strip(), assistant.strip())) - - return qa_list - -
[docs] def process_single(self, sample, rank=None): - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - if self.enable_vllm: - response = model.generate([sample[self.text_key]], - self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(sample[self.text_key], - return_tensors='pt').to(model.device) - response = model.generate(**inputs, **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - qa_list = self._extract_qa(output) - - if not len(qa_list): - logger.info( - 'No question and answer data was extracted from this sample!') - - dialogue_data = [] - if self.qa_format == 'chatml': - for qa in qa_list: - dialogue_data.append({ - 'messages': [{ - 'role': 'user', - 'content': qa[0] - }, { - 'role': 'assistant', - 'content': qa[1] - }] - }) - else: - raise ValueError(f'Not support {self.qa_format}!') - - sample[self.text_key] = json.dumps(dialogue_data, ensure_ascii=False) - - return sample
-
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
similarity index 50%
rename from _modules/data_juicer/ops/mapper/generate_instruction_mapper.html
rename to _modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
index e97503fa1..455dfa757 100644
--- a/_modules/data_juicer/ops/mapper/generate_instruction_mapper.html
+++ b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
@@ -5,7 +5,7 @@
- data_juicer.ops.mapper.generate_instruction_mapper — data_juicer 0.2.0 documentation
+ data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 0.2.0 documentation
@@ -67,7 +67,7 @@
  • - +
  • @@ -76,7 +76,7 @@
    -

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    +  

    Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

     import json
     import random
     import re
    @@ -94,26 +94,15 @@ 

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    vllm = LazyLoader('vllm', 'vllm') rouge = LazyLoader('rouge', 'rouge') -DEFAULT_PROMPT_TEMPLATE = """ -请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求: -1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。 -2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。 -3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。 -4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。 -{augmented_data} -""" -QA_EXTRACTION_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*?)\s*(?=【问题】|$)' -EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n\n{qa_pairs}' -QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' - -OP_NAME = 'generate_instruction_mapper' +OP_NAME = 'generate_qa_from_examples_mapper' # TODO: Extend LLM-based OPs into API-based implementation. -
    [docs]@UNFORKABLE.register_module(OP_NAME) +
    [docs]@UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) -class GenerateInstructionMapper(Mapper): - """Mapper to generate new instruction text data. +class GenerateQAFromExamplesMapper(Mapper): + """ + Mapper to generate question and answer pairs from examples. You should configure an empty dataset in your yaml config file: ``` generated_dataset_config: @@ -124,161 +113,148 @@

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    The number of samples generated is determined by the length of the empty dataset. """ + + DEFAULT_SYSTEM_PROMPT = ( + '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。' + '注意,新生成的【问题】和【回答】需要满足如下要求:\n' + '1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n' + '2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n' + '3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n' + '4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n') + + DEFAULT_INPUT_TEMPLATE = '{}' + DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' + DEFAULT_OUTPUT_PATTERN = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)' + _accelerator = 'cuda' -
    [docs] def __init__(self, - hf_model: str = 'Qwen/Qwen-7B-Chat', +
    [docs] def __init__(self, + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', + *, seed_file: str = '', - instruct_num: PositiveInt = 3, - trust_remote_code: bool = False, + example_num: PositiveInt = 3, similarity_threshold: float = 0.7, - prompt_template: Optional[str] = None, - qa_pair_template: Optional[str] = None, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, example_template: Optional[str] = None, - qa_extraction_pattern: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, **kwargs): """ Initialization method. - :param hf_model: Hugginface model id. - :param seed_file: Seed file path, chatml format. - :param instruct_num: The number of instruction samples. - Randomly select N samples from "seed_file" and - put them into prompt as instruction samples. - :param trust_remote_code: passed to transformers + :param hf_model: Hugginface model ID. + :param seed_file: Path to the seed file in chatml format. + :param example_num: The number of selected examples. + Randomly select N examples from "seed_file" and + put them into prompt as QA examples. :param similarity_threshold: The similarity score threshold - between the generated samples and the seed samples. + between the generated samples and the seed examples. Range from 0 to 1. Samples with similarity score less than this threshold will be kept. - :param prompt_template: Prompt template for generate samples. - Please make sure the template contains "{augmented_data}", - which corresponds to the augmented samples. - :param qa_pair_template: Prompt template for generate question - and answer pair description. Please make sure the template - contains two "{}" to format question and answer. - Default: '【问题】\n{}\n【回答】\n{}\n'. - :param example_template: Prompt template for generate examples. - Please make sure the template contains "{qa_pairs}", which - corresponds to the question and answer pair description - generated by param `qa_pair_template`. - Default: '\n如下是一条示例数据:\n\n{qa_pairs}' - :param qa_extraction_pattern: Regular expression pattern for parsing - question and answer from model response. + :param system_prompt: System prompt for guiding the generation task. + :param input_template: Template for building the input prompt. It must + include one placeholder '{}', which will be replaced by + `example_num` formatted examples defined by `example_template`. + :param example_template: Template for formatting one QA example. It + must include one placeholder '{}', which will be replaced by one + formatted qa_pair. + :param qa_pair_template: Template for formatting a single QA pair + within each example. Must include two placeholders '{}' for the + question and answer. + :param output_pattern: Regular expression pattern to extract questions + and answers from model response. :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. 
- :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. + :param model_params: Parameters for initializing the model. :param sampling_params: Sampling parameters for text generation. e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args + :param kwargs: Extra keyword arguments. """ - super().__init__(*args, **kwargs) - self.num_proc = 1 + super().__init__(**kwargs) if not seed_file: raise ValueError( 'Please provide `seed_file` in chatml format.' 'Example: data-juicer/demos/data/demo-dataset-chatml.jsonl') - self.instruct_num = instruct_num + self.seed_file = seed_file + self.example_num = example_num self.similarity_threshold = similarity_threshold self.similarity_type = 'rouge_l' - if prompt_template is None: - prompt_template = DEFAULT_PROMPT_TEMPLATE - if qa_pair_template is None: - qa_pair_template = QA_PAIR_TEMPLATE - if example_template is None: - example_template = EXAMPLE_TEMPLATE - if qa_extraction_pattern is None: - qa_extraction_pattern = QA_EXTRACTION_PATTERN - - self.prompt_template = prompt_template - self.qa_pair_template = qa_pair_template - self.example_template = example_template - self.qa_extraction_pattern = qa_extraction_pattern + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.example_template = example_template or self.DEFAULT_EXAMPLE_TEMPLATE # noqa: E501 + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: tensor_parallel_size = torch.cuda.device_count() logger.info(f'Set tensor_parallel_size to \ {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size self.model_key = prepare_model( model_type='vllm', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) + **model_params) self.sampling_params = vllm.SamplingParams(**sampling_params) else: self.model_key = prepare_model( model_type='huggingface', pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) + return_pipe=True, + **model_params) self.sampling_params = sampling_params - self.seed_qa_samples = self.load_seed_qa_samples(seed_file) - + self.seed_qa_samples = self._load_seed_qa_samples() if len(self.seed_qa_samples) == 0: - raise ValueError('No QA data was parsed from the seed file!') + raise ValueError('No QA data was parsed from the seed file!')
    - self.reference_samples = [ - '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_pairs]) + '\n' - for qa_pairs in self.seed_qa_samples - ]
    - -
    [docs] def load_seed_qa_samples(self, seed_file): + def _load_seed_qa_samples(self): """Load QA pairs from chatml format file.""" qa_samples = [] - with open(seed_file) as f: + with open(self.seed_file, encoding='utf-8') as f: lines = f.readlines() for line in lines: line = line.strip() - qa_pairs = self.parse_chatml_str(line) + qa_pairs = self._parse_chatml_str(line) if len(qa_pairs) > 0: qa_samples.append(qa_pairs) + return qa_samples - return qa_samples
    - -
    [docs] def build_prompt(self, qa_samples, prompt_template): + def _sample_to_str(self, qa_sample): + return '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_sample]) + '\n' - def format_qa_pairs(qa_pairs): - return ''.join([ - self.qa_pair_template.format(q, a) for q, a in qa_pairs - if q and a - ]) - - body_fragments = [ - self.example_template.format(qa_pairs=format_qa_pairs(qa_pairs)) - for qa_pairs in qa_samples - ] - - body = ''.join(body_fragments) - - return prompt_template.format(augmented_data=body)
    + def _max_rouge_l_score(self, hypothesis, references): + r = rouge.Rouge() + max_score = 0.0 + hyp_str = self._sample_to_str(hypothesis) + for reference in references: + ref_str = self._sample_to_str(reference) + scores = r.get_scores(hyp_str, ref_str) + rouge_l_score = scores[0]['rouge-l']['f'] + if rouge_l_score > max_score: + max_score = rouge_l_score + return max_score -
    [docs] def parse_chatml_str(self, input_str): + def _parse_chatml_str(self, sample_str): user_input = None assistant_output = None qa_pairs = [] - data = json.loads(input_str) + data = json.loads(sample_str) for message in data['messages']: role = message['role'] content = message['content'] @@ -287,79 +263,91 @@

    Source code for data_juicer.ops.mapper.generate_instruction_mapper

    elif role == 'assistant': assistant_output = content qa_pairs.append((user_input, assistant_output)) - return qa_pairs
    - -
    [docs] def parse_response(self, response_str): - pattern = self.qa_extraction_pattern - matches = re.findall(pattern, response_str, re.DOTALL) - response_str = '' - out_qa_pairs = [] - for i, match in enumerate(matches): - question, answer = match - question = question.strip() - answer = answer.strip() - out_qa_pairs.append((question, answer)) - response_str += question + '\n' + answer + '\n' + return qa_pairs - if len(out_qa_pairs) == 0: - logger.error('Parse model response error! ' - 'No data generated for the current response!') +
    [docs] def build_input(self, qa_examples): - return out_qa_pairs, response_str
    - -
    [docs] def max_rouge_l_score(self, reference, candidates): + def format_qa_pairs(qa_example): + return ''.join([ + self.qa_pair_template.format(q, a) for q, a in qa_example + if q and a + ]) - r = rouge.Rouge() - max_score = 0.0 - for candidate in candidates: - scores = r.get_scores(candidate, reference) - rouge_l_score = scores[0]['rouge-l']['f'] - if rouge_l_score > max_score: - max_score = rouge_l_score - return max_score
+ formatted_examples = ''.join([ + self.example_template.format(format_qa_pairs(qa_example)) + for qa_example in qa_examples + ]) + input_prompt = self.input_template.format(formatted_examples) + return input_prompt
    + +
    [docs] def parse_output(self, raw_output): + logger.debug(raw_output) + output_qa_pairs = [] + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + for match in matches: + question, answer = match + output_qa_pairs.append((question.strip(), answer.strip())) + return output_qa_pairs
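For reference, one line of the chatml seed_file consumed by _load_seed_qa_samples/_parse_chatml_str above could look like the sketch below; the message wording is invented, only the {'messages': [{'role': ..., 'content': ...}]} structure is what the code expects:

    # Sketch of one seed line and of the parsing done by _parse_chatml_str.
    import json

    seed_line = json.dumps({
        'messages': [
            {'role': 'user', 'content': '请问蒙古国的首都是哪里?'},
            {'role': 'assistant', 'content': '蒙古国的首都是乌兰巴托(Ulaanbaatar)。'},
        ]
    }, ensure_ascii=False)

    qa_pairs, user_input = [], None
    for message in json.loads(seed_line)['messages']:
        if message['role'] == 'user':
            user_input = message['content']
        elif message['role'] == 'assistant':
            qa_pairs.append((user_input, message['content']))

    print(qa_pairs)  # one (question, answer) tuple per user/assistant turn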
    -
    [docs] def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) +
    [docs] def process_single(self, sample=None, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) random_qa_samples = random.sample(self.seed_qa_samples, - self.instruct_num) - input_prompt = self.build_prompt(random_qa_samples, - self.prompt_template) + self.example_num) + input_prompt = self.build_input(random_qa_samples) + + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': input_prompt + }] + if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - response_str = response[0].outputs[0].text + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - output_ids = model.generate(**inputs, **self.sampling_params) - # remove the input prompt from the output - output_ids = output_ids[:, inputs.data['input_ids'].shape[1]:] - response_str = processor.decode(output_ids.cpu()[0], - skip_special_tokens=True) - message_list = [] - out_qa_pairs, response_str = self.parse_response(response_str) - - if not response_str: - return {self.text_key: json.dumps({'messages': message_list})} + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + output_qa_pairs = self.parse_output(output) + if len(output_qa_pairs) == 0: + logger.warning('Parse model response error! ' + 'No data generated for the current response!') + sample.update({ + self.query_key: '', + self.response_key: '', + self.history_key: self.empty_history() + }) + return sample if self.similarity_type == 'rouge_l': - sim_score = self.max_rouge_l_score(response_str, - self.reference_samples) + sim_score = self._max_rouge_l_score(output_qa_pairs, + random_qa_samples) else: raise ValueError( f'Not support similarity type "{self.similarity_type}"!') if sim_score <= self.similarity_threshold: - for question, answer in out_qa_pairs: - message_list.append({'role': 'user', 'content': question}) - message_list.append({'role': 'assistant', 'content': answer}) + query, response = output_qa_pairs[-1] + history = output_qa_pairs[:-1] + if len(history) == 0: + history = self.empty_history() else: + query = response = '' + history = self.empty_history() logger.info('Filter this generated sample due to similarity.') - return { - self.text_key: - json.dumps({'messages': message_list}, ensure_ascii=False) - }
    + sample.update({ + self.query_key: query, + self.response_key: response, + self.history_key: history + }) + return sample
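process_single above keeps a generated sample only if its maximal ROUGE-L F score against the sampled seed examples stays at or below similarity_threshold. A toy sketch of that check using the same rouge package API as _max_rouge_l_score; the example strings are fabricated:

    # Sketch of the ROUGE-L novelty filter; requires `pip install rouge`.
    from rouge import Rouge

    similarity_threshold = 0.7
    generated = 'Which city is the capital of Iceland? It is Reykjavik.'
    seed_examples = ['The capital of Mongolia is Ulaanbaatar.']

    max_score = max(
        Rouge().get_scores(generated, ref)[0]['rouge-l']['f']
        for ref in seed_examples)
    keep = max_score <= similarity_threshold
    print(round(max_score, 3), 'keep' if keep else 'too similar, drop')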
    diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html new file mode 100644 index 000000000..02571066e --- /dev/null +++ b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html @@ -0,0 +1,255 @@ + + + + + + + + data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 0.2.0 documentation + + + + + + + + + + + + + +

    Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

    +import re
    +from typing import Dict, Optional
    +
    +from loguru import logger
    +
    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    +from data_juicer.utils.lazy_loader import LazyLoader
    +from data_juicer.utils.model_utils import get_model, prepare_model
    +
    +torch = LazyLoader('torch', 'torch')
    +vllm = LazyLoader('vllm', 'vllm')
    +
    +OP_NAME = 'generate_qa_from_text_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class GenerateQAFromTextMapper(Mapper): + """ + Mapper to generate question and answer pairs from text. + Recommended model list: [ + 'alibaba-pai/pai-llama3-8b-doc2qa', + 'alibaba-pai/pai-baichuan2-7b-doc2qa', + 'alibaba-pai/pai-qwen1_5-4b-doc2qa', + 'alibaba-pai/pai-qwen1_5-7b-doc2qa', + 'alibaba-pai/pai-qwen1_5-1b8-doc2qa', + 'alibaba-pai/pai-qwen1_5-0b5-doc2qa' + ] + These recommended models are all trained with Chinese data + and are suitable for Chinese. + """ + + _accelerator = 'cuda' + _batched_op = True + +
    [docs] def __init__(self, + hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', + *, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, + **kwargs): + """ + Initialization method. + + :param hf_model: Hugginface model ID. + :param output_pattern: Regular expression pattern to extract + questions and answers from model response. + :param enable_vllm: Whether to use vllm for inference acceleration. + :param model_params: Parameters for initializing the model. + :param sampling_params: Sampling parameters for text generation, + e.g {'temperature': 0.9, 'top_p': 0.95} + :param kwargs: Extra keyword arguments. + + The default data format parsed by this interface is as follows: + Model Input: + 蒙古国的首都是乌兰巴托(Ulaanbaatar) + 冰岛的首都是雷克雅未克(Reykjavik) + Model Output: + 蒙古国的首都是乌兰巴托(Ulaanbaatar) + 冰岛的首都是雷克雅未克(Reykjavik) + Human: 请问蒙古国的首都是哪里? + Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 + Human: 冰岛的首都是哪里呢? + Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 + ... + """ + + super().__init__(**kwargs) + + if output_pattern is None: + self.output_pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)' # noqa: E501 + else: + self.output_pattern = output_pattern + + self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} + + if enable_vllm: + assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=hf_model, + **model_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) + else: + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=hf_model, + return_pipe=True, + **model_params) + self.sampling_params = sampling_params
    + +
    [docs] def parse_output(self, raw_output): + logger.debug(raw_output) + qa_list = [] + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + for match in matches: + user, assistant = match + qa_list.append((user.strip(), assistant.strip())) + return qa_list
    + +
    [docs] def process_batched(self, samples, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) + + input_keys = samples.keys() + num_samples = len(samples[next(iter(input_keys))]) + output_keys = input_keys | {self.query_key, self.response_key} + output_samples = {key: [] for key in output_keys} + + for i in range(num_samples): + messages = [{'role': 'user', 'content': samples[self.text_key][i]}] + + if self.enable_vllm: + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + else: + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + qa_list = self.parse_output(output) + if len(qa_list) > 0: + for q, a in qa_list: + for input_k in input_keys: + output_samples[input_k].append(samples[input_k][i]) + output_samples[self.query_key].append(q) + output_samples[self.response_key].append(a) + else: + logger.warning( + 'No question and answer was extracted from current sample!' + ) + + return output_samples
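Because process_batched may emit several rows per input document, the mapper copies every original column once per extracted pair and appends query/response. A compact illustration of the default output_pattern and of that fan-out, with a hand-written string standing in for the model output:

    # Sketch of the fan-out in process_batched; `fake_output` replaces the
    # model call, the regex is the mapper's default output_pattern.
    import re

    output_pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)'
    fake_output = ('Human: 请问蒙古国的首都是哪里?\n'
                   'Assistant: 蒙古国的首都是乌兰巴托(Ulaanbaatar)。\n'
                   'Human: 冰岛的首都是哪里呢?\n'
                   'Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。')

    samples = {'text': ['蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)']}
    out = {'text': [], 'query': [], 'response': []}
    for q, a in re.findall(output_pattern, fake_output, re.DOTALL):
        out['text'].append(samples['text'][0])  # original columns are repeated
        out['query'].append(q.strip())
        out['response'].append(a.strip())

    print(len(out['query']))  # 2 QA rows produced from 1 input document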
    +
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html b/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
deleted file mode 100644
index 9404f2336..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_instruction_mapper.html
+++ /dev/null
@@ -1,224 +0,0 @@
- data_juicer.ops.mapper.optimize_instruction_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_instruction_mapper

    -from typing import Dict, Optional
    -
    -from loguru import logger
    -
    -from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    -from data_juicer.utils.lazy_loader import LazyLoader
    -from data_juicer.utils.model_utils import get_model, prepare_model
    -
    -torch = LazyLoader('torch', 'torch')
    -vllm = LazyLoader('vllm', 'vllm')
    -
    -DEFAULT_SYSTEM_PROMPT = '请优化这个指令,将其修改为一个更详细具体的指令。'
    -
    -OP_NAME = 'optimize_instruction_mapper'
    -
    -
    -# TODO: Extend LLM-based OPs into API-based implementation.
    -
    [docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class OptimizeInstructionMapper(Mapper): - """Mapper to optimize instruction. - Recommended model list: [ - alibaba-pai/Qwen2-1.5B-Instruct-Refine - alibaba-pai/Qwen2-7B-Instruct-Refine - ] - """ - _accelerator = 'cuda' - -
    [docs] def __init__(self, - hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', - trust_remote_code: bool = False, - system_prompt: Optional[str] = None, - enable_vllm: bool = True, - tensor_parallel_size: Optional[int] = None, - max_model_len: Optional[int] = None, - max_num_seqs: int = 256, - sampling_params: Dict = {}, - *args, - **kwargs): - """ - Initialization method. - :param hf_model: Hugginface model id. - :param trust_remote_code: passed to transformers - :param system_prompt: System prompt for optimize samples. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param tensor_parallel_size: It is only valid when enable_vllm is True. - The number of GPUs to use for distributed execution with tensor - parallelism. - :param max_model_len: It is only valid when enable_vllm is True. - Model context length. If unspecified, will be automatically - derived from the model config. - :param max_num_seqs: It is only valid when enable_vllm is True. - Maximum number of sequences to be processed in a single iteration. - :param sampling_params: Sampling parameters for text generation. - e.g {'temperature': 0.9, 'top_p': 0.95} - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.num_proc = 1 - - if system_prompt is None: - system_prompt = DEFAULT_SYSTEM_PROMPT - self.system_prompt = system_prompt - self.enable_vllm = enable_vllm - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - if not tensor_parallel_size: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code, - tensor_parallel_size=tensor_parallel_size, - max_model_len=max_model_len, - max_num_seqs=max_num_seqs) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - trust_remote_code=trust_remote_code) - self.sampling_params = sampling_params
    - -
    [docs] def process_single(self, sample=None, rank=None): - model, processor = get_model(self.model_key, rank=rank) - - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': sample[self.text_key] - }] - input_prompt = processor.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True) - - if self.enable_vllm: - response = model.generate([input_prompt], self.sampling_params) - output = response[0].outputs[0].text - else: - inputs = processor(input_prompt, - return_tensors='pt').to(model.device) - response = model.generate(**inputs, - eos_token_id=processor.eos_token_id, - **self.sampling_params) - output = processor.decode(response.cpu()[0], - skip_special_tokens=True) - - sample[self.text_key] = output - - return sample
    -
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
new file mode 100644
index 000000000..a040d8438
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
@@ -0,0 +1,254 @@
+ data_juicer.ops.mapper.optimize_qa_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_qa_mapper

    +import re
    +from typing import Dict, Optional
    +
    +from loguru import logger
    +
    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
    +from data_juicer.utils.lazy_loader import LazyLoader
    +from data_juicer.utils.model_utils import get_model, prepare_model
    +
    +torch = LazyLoader('torch', 'torch')
    +vllm = LazyLoader('vllm', 'vllm')
    +
    +OP_NAME = 'optimize_qa_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeQAMapper(Mapper): + """ + Mapper to optimize question-answer pairs. + """ + + # avoid leading whitespace + DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。' + '必须按照以下标记格式,直接输出优化后的问答对:\n' + '【问题】\n' + '优化后的问题\n' + '【回答】\n' + '优化后的回答') + DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}' + DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' + DEFAULT_OUTPUT_PATTERN = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)' + + _accelerator = 'cuda' + +
    [docs] def __init__(self, + hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', + *, + system_prompt: Optional[str] = None, + input_template: Optional[str] = None, + qa_pair_template: Optional[str] = None, + output_pattern: Optional[str] = None, + enable_vllm: bool = False, + model_params: Optional[Dict] = None, + sampling_params: Optional[Dict] = None, + **kwargs): + """ + Initialization method. + + :param hf_model: Hugging Face model ID. + :param system_prompt: System prompt for guiding the optimization task. + :param input_template: Template for building the input for the model. + Please make sure the template contains one placeholder '{}', which + corresponds to the question and answer pair generated by + param `qa_pair_template`. + :param qa_pair_template: Template for formatting the question and + answer pair. Please make sure the template contains two + '{}' to format question and answer. + :param output_pattern: Regular expression pattern to extract question + and answer from model response. + :param enable_vllm: Whether to use VLLM for inference acceleration. + :param model_params: Parameters for initializing the model. + :param sampling_params: Sampling parameters for text generation (e.g., + {'temperature': 0.9, 'top_p': 0.95}). + :param kwargs: Extra keyword arguments. + """ + super().__init__(**kwargs) + + self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT + self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE + self.qa_pair_template = qa_pair_template or \ + self.DEFAULT_QA_PAIR_TEMPLATE + self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN + + self.enable_vllm = enable_vllm + model_params = model_params or {} + sampling_params = sampling_params or {} + + if enable_vllm: + assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' + # cannot initialize vllm replicas on different GPUs + self.num_proc = 1 + if model_params.get('tensor_parallel_size') is None: + tensor_parallel_size = torch.cuda.device_count() + logger.info(f'Set tensor_parallel_size to \ + {tensor_parallel_size} for vllm.') + model_params['tensor_parallel_size'] = tensor_parallel_size + self.model_key = prepare_model( + model_type='vllm', + pretrained_model_name_or_path=hf_model, + **model_params) + self.sampling_params = vllm.SamplingParams(**sampling_params) + else: + self.model_key = prepare_model( + model_type='huggingface', + pretrained_model_name_or_path=hf_model, + return_pipe=True, + **model_params) + self.sampling_params = sampling_params
    + +
    [docs] def build_input(self, sample): + qa_pair = self.qa_pair_template.format(sample[self.query_key], + sample[self.response_key]) + input_prompt = self.input_template.format(qa_pair) + return input_prompt
    + +
[docs] def parse_output(self, raw_output): + logger.debug(raw_output) + matches = re.findall(self.output_pattern, raw_output, re.DOTALL) + if matches: + question, answer = matches[0] + return question.strip(), answer.strip() + else: + return None, None
    + +
    [docs] def process_single(self, sample=None, rank=None): + model, _ = get_model(self.model_key, rank, self.use_cuda()) + + input_prompt = self.build_input(sample) + messages = [{ + 'role': 'system', + 'content': self.system_prompt + }, { + 'role': 'user', + 'content': input_prompt + }] + + if self.enable_vllm: + response = model.chat(messages, self.sampling_params) + output = response[0].outputs[0].text + else: + # model is pipe + response = model(messages, + return_full_text=False, + **self.sampling_params) + output = response[0]['generated_text'] + + parsed_q, parsed_a = self.parse_output(output) + if parsed_q: + sample[self.query_key] = parsed_q + if parsed_a: + sample[self.response_key] = parsed_a + + return sample
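Putting the defaults above together: build_input wraps the current query/response into the 【问题】/【回答】 layout, process_single sends it behind the system prompt, and parse_output recovers the optimized pair via output_pattern. A self-contained sketch of that round trip in which the model reply is hard-coded:

    # Sketch of OptimizeQAMapper's prompt assembly and parsing with the
    # default templates; `fake_reply` stands in for the model generation.
    import re

    input_template = '以下是原始问答对:\n{}'
    qa_pair_template = '【问题】\n{}\n【回答】\n{}'
    output_pattern = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)'

    sample = {'query': '冰岛的首都是哪里呢?', 'response': '雷克雅未克。'}
    prompt = input_template.format(
        qa_pair_template.format(sample['query'], sample['response']))

    fake_reply = '【问题】\n冰岛的首都是哪座城市?\n【回答】\n冰岛的首都是雷克雅未克(Reykjavik)。'
    matches = re.findall(output_pattern, fake_reply, re.DOTALL)
    if matches:
        query, response = matches[0]
        sample['query'], sample['response'] = query.strip(), response.strip()

    print(prompt)
    print(sample)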
    +
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
new file mode 100644
index 000000000..ab704fc5a
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
@@ -0,0 +1,129 @@
+ data_juicer.ops.mapper.optimize_query_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_query_mapper

    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
    +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
    +
    +OP_NAME = 'optimize_query_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeQueryMapper(OptimizeQAMapper): + """ + Mapper to optimize query in question-answer pairs. + """ + + DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501 + + _accelerator = 'cuda' + +
    [docs] def parse_output(self, raw_output): + return raw_output.strip(), None
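This override, and the matching one in OptimizeResponseMapper further down, keep OptimizeQAMapper's whole prompting pipeline and only change which half of the pair gets written back: returning None for one slot makes process_single leave that field untouched. A tiny sketch of that contract:

    # Sketch of the (parsed_q, parsed_a) contract used by process_single:
    # a None half leaves the corresponding sample field unchanged.
    def write_back(sample, parsed_q, parsed_a):
        if parsed_q:
            sample['query'] = parsed_q
        if parsed_a:
            sample['response'] = parsed_a
        return sample

    print(write_back({'query': 'q0', 'response': 'r0'}, '更详细的问题', None))
    # -> {'query': '更详细的问题', 'response': 'r0'}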
    +
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
new file mode 100644
index 000000000..22993ce81
--- /dev/null
+++ b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
@@ -0,0 +1,129 @@
+ data_juicer.ops.mapper.optimize_response_mapper — data_juicer 0.2.0 documentation

    Source code for data_juicer.ops.mapper.optimize_response_mapper

    +from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
    +from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
    +
    +OP_NAME = 'optimize_response_mapper'
    +
    +
    +# TODO: Extend LLM-based OPs into API-based implementation.
    +
    [docs]@UNFORKABLE.register_module(OP_NAME) +@OPERATORS.register_module(OP_NAME) +class OptimizeResponseMapper(OptimizeQAMapper): + """ + Mapper to optimize response in question-answer pairs. + """ + + DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' + + _accelerator = 'cuda' + +
    [docs] def parse_output(self, raw_output): + return None, raw_output.strip()
    +
\ No newline at end of file
diff --git a/_modules/index.html b/_modules/index.html
index a553f9a40..7df789de5 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -159,9 +159,9 @@

    All modules for which code is available

  • data_juicer.ops.mapper.clean_ip_mapper
  • data_juicer.ops.mapper.clean_links_mapper
  • data_juicer.ops.mapper.expand_macro_mapper
  • -
  • data_juicer.ops.mapper.extract_qa_mapper
  • data_juicer.ops.mapper.fix_unicode_mapper
  • -
  • data_juicer.ops.mapper.generate_instruction_mapper
  • +
  • data_juicer.ops.mapper.generate_qa_from_examples_mapper
  • +
  • data_juicer.ops.mapper.generate_qa_from_text_mapper
  • data_juicer.ops.mapper.image_blur_mapper
  • data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper
  • data_juicer.ops.mapper.image_captioning_mapper
  • @@ -170,7 +170,9 @@

    All modules for which code is available

  • data_juicer.ops.mapper.image_tagging_mapper
  • data_juicer.ops.mapper.nlpaug_en_mapper
  • data_juicer.ops.mapper.nlpcda_zh_mapper
  • -
  • data_juicer.ops.mapper.optimize_instruction_mapper
  • +
  • data_juicer.ops.mapper.optimize_qa_mapper
  • +
  • data_juicer.ops.mapper.optimize_query_mapper
  • +
  • data_juicer.ops.mapper.optimize_response_mapper
  • data_juicer.ops.mapper.punctuation_normalization_mapper
  • data_juicer.ops.mapper.remove_bibliography_mapper
  • data_juicer.ops.mapper.remove_comments_mapper
  • diff --git a/data_juicer.ops.deduplicator.html b/data_juicer.ops.deduplicator.html index b3126bfce..34c7a68be 100644 --- a/data_juicer.ops.deduplicator.html +++ b/data_juicer.ops.deduplicator.html @@ -47,15 +47,15 @@
  • data_juicer.ops.filter
  • data_juicer.ops.mapper
  • data_juicer.ops.deduplicator
  • data_juicer.ops.selector
  • @@ -92,44 +92,45 @@

    data_juicer.ops.deduplicator

    -
    -class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching -of videos between documents.

    +

    Deduplicator to deduplicate samples at document-level using exact matching.

    +

    Using md5 hash to deduplicate samples.

    -
    -__init__(consider_text: bool = False, *args, **kwargs)[source]
    -

    Initialization.

    +
    +__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +

    Initialization method.

    Parameters:
      -
    • consider_text – whether to consider text hash together with video -hash when applying deduplication.

    • +
    • lowercase – Whether to convert sample text to lower case

    • +
    • ignore_non_character – Whether to ignore non-alphabet +characters, including whitespaces, digits, and punctuations

    • args – extra args

    • -
    • kwargs – extra args

    • +
    • kwargs – extra args.

    -
    -compute_hash(sample, context=False)[source]
    -

    Compute hash values for the sample.

    +
    +compute_hash(sample)[source]
    +

    Compute md5 hash values for the sample.

    Parameters:

    sample – input sample

    Returns:
    -

    sample with computed hash value.

    +

    sample with md5 hash value.

    -
    -process(dataset, show_num=0)[source]
    +
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -147,69 +148,6 @@
    -
    -
    -class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    -

    Bases: Filter

    -

    A basic exact matching deduplicator for RAY. -Although its functionality is deduplication, -it is implemented as Filter sub-class.

    -
    -
    -EMPTY_HASH_VALUE = 'EMPTY'
    -
    - -
    -
    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    -

    Initialization. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param args: extra args -:param kwargs: extra args

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    -
    -compute_stats_single(sample, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    - -
    -
    -process_single(sample)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    - -
    -
    class data_juicer.ops.deduplicator.DocumentMinhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 5, lowercase: bool = True, ignore_pattern: str | None = None, num_permutations: int[int] = 256, jaccard_threshold: float[float] = 0.7, num_bands: int[int] | None = None, num_rows_per_band: int[int] | None = None, tokenizer_model: str | None = None, *args, **kwargs)[source]
    @@ -291,95 +229,54 @@
    -
    -class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    -

    Bases: RayBasicDeduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching -of images between documents.

    -
    -
    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    -

    Initialization. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param args: extra args -:param kwargs: extra args

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    - -
    -
    -class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Bases: RayBasicDeduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching.

    -
    -
    -__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Initialization method. -:param redis_host: the hostname of redis server -:param redis_port: the port of redis server -:param lowercase: Whether to convert sample text to lower case -:param ignore_non_character: Whether to ignore non-alphabet -characters, including whitespaces, digits, and punctuations -:param args: extra args -:param kwargs: extra args.

    -
    - -
    -
    -calculate_hash(sample, context=False)[source]
    -

    Calculate hash value for the sample.

    -
    - -
    - -
    -
    -class data_juicer.ops.deduplicator.DocumentDeduplicator(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using exact matching.

    -

    Using md5 hash to deduplicate samples.

    +

    Deduplicator to deduplicate samples at document-level using SimHash.

    -
    -__init__(lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    -

    Initialization method.

    +
    +__init__(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    +

    Initialization method :param tokenization: tokenization method for +sample texts.

    +

    It should be one of [space, punctuation, character]. For +English-like languages, we recommend to use ‘space’. And for +Chinese-like languages, we recommend to use ‘character’

    Parameters:
      -
    • lowercase – Whether to convert sample text to lower case

    • -
    • ignore_non_character – Whether to ignore non-alphabet -characters, including whitespaces, digits, and punctuations

    • -
    • args – extra args

    • -
    • kwargs – extra args.

    • +
    • window_size – window size of shingling

    • +
    • lowercase – whether to convert text to lower case first

    • +
    • ignore_pattern – whether to ignore sub-strings with +specific pattern when computing simhash

    • +
    • num_blocks – number of blocks in simhash computing

    • +
    • hamming_distance – the max hamming distance threshold in +near-duplicate detection. When the hamming distance of two +sample texts is <= this threshold, they are regarded as +similar samples and this op will only keep one of them after +deduplication. This threshold should be always less than +num_blocks

    -
    -compute_hash(sample)[source]
    -

    Compute md5 hash values for the sample.

    +
    +compute_hash(sample)[source]
    +

    Compute simhash values for the sample.

    Parameters:

    sample – input sample

    Returns:
    -

    sample with md5 hash value.

    +

    sample with simhash value.

    -
    -process(dataset, show_num=0)[source]
    +
    +process(dataset, show_num=0)[source]

    For doc-level, dataset –> dataset.

    Parameters:
    @@ -455,71 +352,118 @@
    -
    -class data_juicer.ops.deduplicator.DocumentSimhashDeduplicator(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    -

    Bases: Deduplicator

    -

    Deduplicator to deduplicate samples at document-level using SimHash.

    +
    +class data_juicer.ops.deduplicator.RayBasicDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +

    Bases: Filter

    +

    A basic exact matching deduplicator for RAY. +Although its functionality is deduplication, +it is implemented as Filter sub-class.

    +
    +
    +EMPTY_HASH_VALUE = 'EMPTY'
    +
    +
    -
    -__init__(tokenization: str = 'space', window_size: int[int] = 6, lowercase: bool = True, ignore_pattern: str | None = None, num_blocks: int[int] = 6, hamming_distance: int[int] = 4, *args, **kwargs)[source]
    -

    Initialization method :param tokenization: tokenization method for -sample texts.

    -

    It should be one of [space, punctuation, character]. For -English-like languages, we recommend to use ‘space’. And for -Chinese-like languages, we recommend to use ‘character’

    -
    -
    Parameters:
    -
      -
    • window_size – window size of shingling

    • -
    • lowercase – whether to convert text to lower case first

    • -
    • ignore_pattern – whether to ignore sub-strings with -specific pattern when computing simhash

    • -
    • num_blocks – number of blocks in simhash computing

    • -
    • hamming_distance – the max hamming distance threshold in -near-duplicate detection. When the hamming distance of two -sample texts is <= this threshold, they are regarded as -similar samples and this op will only keep one of them after -deduplication. This threshold should be always less than -num_blocks

    • -
    -
    -
    +
    +__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    +

Initialization method.

Parameters:
• redis_host – the hostname of the redis server
• redis_port – the port of the redis server
• args – extra args
• kwargs – extra args

    -
    -compute_hash(sample)[source]
    -

    Compute simhash values for the sample.

    +
    +calculate_hash(sample, context=False)[source]
    +

    Calculate hash value for the sample.

    +
    + +
    +
    +compute_stats_single(sample, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    Parameters:
    -

    sample – input sample

    +
      +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    • +
    Returns:
    -

    sample with simhash value.

    +

    sample with computed stats

    -
    -process(dataset, show_num=0)[source]
    -

    For doc-level, dataset –> dataset.

    +
    +process_single(sample)[source]
    +

    For sample level, sample –> Boolean.

    Parameters:
    -
      -
    • dataset – input dataset

    • -
    • show_num – number of traced samples used when tracer is -open.

    • -
    +

    sample – sample to decide whether to filter

    Returns:
    -

    deduplicated dataset and the sampled duplicate pairs.

    +

    true for keeping and false for filtering
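A sketch of the exact-matching idea these Ray deduplicators describe, assuming a reachable redis server and the redis Python client; this is not the op's actual implementation:

    import hashlib
    import redis

    r = redis.Redis(host='localhost', port=6380)

    def keep_sample(text: str) -> bool:
        # empty texts fall back to the documented EMPTY_HASH_VALUE
        key = hashlib.md5(text.encode('utf-8')).hexdigest() if text else 'EMPTY'
        # setnx succeeds only for the first sample claiming this hash,
        # so later exact duplicates return False and are filtered out
        return bool(r.setnx(key, 1))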

    +
    +
    +class data_juicer.ops.deduplicator.RayDocumentDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +

    Bases: RayBasicDeduplicator

    +

    Deduplicator to deduplicate samples at document-level using exact matching.

    +
    +
    +__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, lowercase: bool = False, ignore_non_character: bool = False, *args, **kwargs)[source]
    +

Initialization method.

Parameters:
• redis_host – the hostname of the redis server
• redis_port – the port of the redis server
• lowercase – whether to convert sample text to lower case
• ignore_non_character – whether to ignore non-alphabet characters, including whitespaces, digits, and punctuations
• args – extra args
• kwargs – extra args

    +
    + +
    +
    +calculate_hash(sample, context=False)[source]
    +

    Calculate hash value for the sample.

    +
    + +
    + +
    +
    +class data_juicer.ops.deduplicator.RayImageDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    +

    Bases: RayBasicDeduplicator

    +

    Deduplicator to deduplicate samples at document-level using exact matching +of images between documents.

    +
    +
    +__init__(redis_host: str = 'localhost', redis_port: int[int] = 6380, method: str = 'phash', *args, **kwargs)[source]
    +

Initialization method.

Parameters:
• redis_host – the hostname of the redis server
• redis_port – the port of the redis server
• args – extra args
• kwargs – extra args

    +
    + +
    +
    +calculate_hash(sample, context=False)[source]
    +

    Calculate hash value for the sample.

    +
    + +
    +
    class data_juicer.ops.deduplicator.RayVideoDeduplicator(redis_host: str = 'localhost', redis_port: int[int] = 6380, *args, **kwargs)[source]
    @@ -544,6 +488,62 @@
    +
    +
    +class data_juicer.ops.deduplicator.VideoDeduplicator(consider_text: bool = False, *args, **kwargs)[source]
    +

    Bases: Deduplicator

    +

    Deduplicator to deduplicate samples at document-level using exact matching +of videos between documents.

    +
    +
    +__init__(consider_text: bool = False, *args, **kwargs)[source]
    +

    Initialization.

    +
    +
    Parameters:
    +
      +
    • consider_text – whether to consider text hash together with video +hash when applying deduplication.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +compute_hash(sample, context=False)[source]
    +

    Compute hash values for the sample.

    +
    +
    Parameters:
    +

    sample – input sample

    +
    +
    Returns:
    +

    sample with computed hash value.

    +
    +
    +
    + +
    +
    +process(dataset, show_num=0)[source]
    +

    For doc-level, dataset –> dataset.

    +
    +
    Parameters:
    +
      +
    • dataset – input dataset

    • +
    • show_num – number of traced samples used when tracer is +open.

    • +
    +
    +
    Returns:
    +

    deduplicated dataset and the sampled duplicate pairs.
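An illustrative configuration of the video deduplicator above (values are examples only):

    from data_juicer.ops.deduplicator import VideoDeduplicator

    # also hash the text field, so samples must match on both video and text
    op = VideoDeduplicator(consider_text=True)
    # sample = op.compute_hash(sample)            # adds the hash value to the sample
    # dataset, dup_pairs = op.process(dataset)    # doc-level: dataset --> dataset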

    +
    +
    +
    + +
    +
diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html
index 81afd97db..b9c1f1dd0 100644
--- a/data_juicer.ops.filter.html
+++ b/data_juicer.ops.filter.html
@@ -45,49 +45,49 @@
  • data_juicer.core
  • data_juicer.ops
  • data_juicer.ops.filter
  • data_juicer.ops.mapper
  • @@ -126,33 +126,28 @@

    data_juicer.ops.filter

    -
    -class data_juicer.ops.filter.ImageTextSimilarityFilter(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

Filter to keep samples whose similarities between image and text are within a specific range.

    +

    Filter to keep samples with alphabet/numeric ratio within a specific +range.

    -
    -__init__(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_clip – clip model name on huggingface to compute -the similarity between image and text.

    • -
    • min_score – The min similarity to keep samples.

    • -
    • max_score – The max similarity to keep samples.

    • -
    • horizontal_flip – Flip image horizontally (left to right).

    • -
    • vertical_flip – Flip image vertically (top to bottom).

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    • -
    • reduce_mode – reduce mode when one text corresponds to -multiple images in a chunk. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • +
    • tokenization – Whether to count the ratio of alphanumeric +to the total number of tokens. if tokenization=False, it +will count the ratio of alphanumeric to the total number of +characters.

    • +
    • min_ratio – The min filter ratio in alphanumeric op, +samples will be filtered if their alphabet/numeric ratio is +below this parameter.

    • +
    • max_ratio – The max filter ratio in alphanumeric op, +samples will be filtered if their alphabet/numeric ratio +exceeds this parameter.

    • args – extra args

    • kwargs – extra args

    @@ -161,60 +156,36 @@
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    +
    +compute_stats_batched(samples)[source]
    +
    -
    -process_single(sample, rank=None)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    +
    +process_batched(samples)[source]
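A minimal usage sketch for the AlphanumericFilter above (thresholds are illustrative; assumes data_juicer is installed):

    from data_juicer.ops.filter import AlphanumericFilter

    # keep samples whose alphanumeric-character ratio is at least 0.6
    op = AlphanumericFilter(tokenization=False, min_ratio=0.6)
    # samples = op.compute_stats_batched(samples)  # writes the ratio into each sample's stats
    # keep_flags = op.process_batched(samples)     # booleans: True = keep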
    +
    -
    -class data_juicer.ops.filter.VideoAspectRatioFilter(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.AudioDurationFilter(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with video aspect ratio within a specific range. -AspectRatio = W / H.

    +

    Keep data samples whose audios’ durations are within a specified range.

    -
    -__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_duration: int = 0, max_duration: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_ratio – The minimum aspect ratio to keep samples, -supported format is a string, such as “9:21” or “9/21”.

    • -
    • max_ratio – The maximum aspect ratio to keep samples, -supported format is a string, such as “21:9” or “21/9”.

    • +
    • min_duration – The min audio duration to keep samples in seconds. +It’s 0 by default.

    • +
    • max_duration – The max audio duration to keep samples in seconds. +It’s sys.maxsize by default.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the +all audios. ‘any’: keep this sample if any audios meet the +condition. ‘all’: keep this sample only if all audios meet the condition.

    • args – extra args

    • kwargs – extra args

    • @@ -224,8 +195,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -243,8 +214,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -259,33 +230,28 @@
    -
    -class data_juicer.ops.filter.ImageTextMatchingFilter(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.AudioNMFSNRFilter(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

Filter to keep samples whose matching scores between image and text are within a specific range.

    +

    Keep data samples whose audios’ SNRs (computed based on NMF) are within +a specified range.

    -
    -__init__(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +__init__(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_blip – blip model name on huggingface to compute -the matching score between image and text.

    • -
    • min_score – The min matching score to keep samples.

    • -
    • max_score – The max matching score to keep samples.

    • -
    • horizontal_flip – Flip image horizontally (left to right).

    • -
    • vertical_flip – Flip image vertically (top to bottom).

    • +
    • min_snr – The min audio SNR to keep samples in dB. It’s 0 by +default.

    • +
    • max_snr – The max audio SNR to keep samples in dB. It’s +sys.maxsize by default.

    • +
• nmf_iter_num – The max number of iterations to run NMF. It’s 500 by default.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +all audios. ‘any’: keep this sample if any audios meet the +condition. ‘all’: keep this sample only if all audios meet the condition.

    • -
    • reduce_mode – reduce mode when one text corresponds to -multiple images in a chunk. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • args – extra args

    • kwargs – extra args

    @@ -294,8 +260,8 @@
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -313,8 +279,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -329,24 +295,25 @@
    -
    -class data_juicer.ops.filter.ImageNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.AudioSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples whose images have low nsfw scores.

    +

Keep data samples whose audio size (in bytes/KB/MB/…) is within a specific range.

    -
    -__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_nsfw_model – nsfw detection model name on huggingface.

    • -
    • score_threshold – the nsfw score threshold for samples. -range from 0 to 1. Samples with nsfw score less than this threshold -will be kept.

    • +
• min_size – The min audio size to keep samples. Set to “0” by default for no size constraint.

    • +
• max_size – The max audio size to keep samples. Set to “1TB” by default, an approximation of the unlimited case.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the +all audios. ‘any’: keep this sample if any audios meet the +condition. ‘all’: keep this sample only if all audios meet the condition.

    • args – extra args

    • kwargs – extra args

    • @@ -356,8 +323,8 @@
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -375,8 +342,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -391,24 +358,23 @@
    -
    -class data_juicer.ops.filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.AverageLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with total token number within a specific +

    Filter to keep samples with average line length within a specific range.

    -
    -__init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_tokenizer – the tokenizer name of Hugging Face tokenizers.

    • -
    • min_num – The min filter token number in this op, samples -will be filtered if their token number is below this +

    • min_len – The min filter length in this op, samples will +be filtered if their average line length is below this parameter.

    • -
    • max_num – The max filter token number in this op, samples -will be filtered if their token number exceeds this +

    • max_len – The max filter length in this op, samples will +be filtered if their average line length exceeds this parameter.

    • args – extra args

    • kwargs – extra args

    • @@ -418,59 +384,37 @@
    -
    -compute_stats_single(sample)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    +
    +compute_stats_batched(samples, context=False)[source]
    +
    -
    -process_single(sample)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    +
    +process_batched(samples)[source]
    +
    -
    -class data_juicer.ops.filter.TextLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.CharacterRepetitionFilter(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with total text length within a specific -range.

    +

    Filter to keep samples with char-level n-gram repetition ratio within a +specific range.

    -
    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(rep_len: int[int] = 10, min_ratio: float = 0.0, max_ratio: float = 0.5, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_len – The min text length in the filtering. samples -will be filtered if their text length is below this -parameter.

    • -
    • max_len – The max text length in the filtering. samples -will be filtered if their text length exceeds this -parameter.

    • +
    • rep_len – Repetition length for char-level n-gram.

    • +
    • min_ratio – The min filter ratio in this op, samples will +be filtered if their char-level n-gram repetition ratio is +below this parameter.

    • +
    • max_ratio – The max filter ratio in this op, samples will +be filtered if their char-level n-gram repetition ratio +exceeds this parameter.

    • args – extra args

    • kwargs – extra args

    @@ -479,41 +423,43 @@
    -
    -compute_stats_batched(samples)[source]
    +
    +compute_stats_batched(samples)[source]
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter based on specified numeric field information.

    -

    If the specified numeric information in the sample is not within the -specified range, the sample will be filtered.

    +

    Filter to keep samples with flagged-word ratio less than a specific max +value.

    -
    -__init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • field_key – Filter based on the specified numeric value -corresponding to the target key. The target key -corresponding to multi-level field information need to be -separated by ‘.’.

    • -
    • min_value – The min filter value in SpecifiedNumericField -op, samples will be filtered if their specified numeric -field value is below this parameter.

    • -
    • max_value – The max filter value in SpecifiedNumericField -op, samples will be filtered if their specified numeric -field value exceeds this parameter.

    • +
    • lang – Consider flagged words in what language. If lang == +“all”, we will adopt the one merged from all the available +languages

    • +
    • tokenization – Whether to use model to tokenize documents

    • +
    • max_ratio – The max filter ratio in this op.

    • +
    • flagged_words_dir – The directory storing the +flagged_words file(s) whose name includes “flagged_words” +and in json format

    • +
    • use_words_aug – Whether to augment words, especially for +Chinese and Vietnamese

    • +
    • words_aug_group_sizes – The group size of words to augment

    • +
    • words_aug_join_char – The join char between words to +augment

    • args – extra args

    • kwargs – extra args

    @@ -522,8 +468,8 @@
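A configuration sketch for the FlaggedWordFilter above; 'en' and the ratio are the documented defaults:

    from data_juicer.ops.filter import FlaggedWordFilter

    # filter out samples whose flagged-word ratio exceeds 4.5%
    op = FlaggedWordFilter(lang='en', tokenization=False, max_ratio=0.045)
    # sample = op.compute_stats_single(sample)
    # keep = op.process_single(sample)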
    -
    -compute_stats_single(sample)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -541,8 +487,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -557,38 +503,37 @@
    -
    -class data_juicer.ops.filter.AudioNMFSNRFilter(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose audios’ SNRs (computed based on NMF) are within -a specified range.

    +

    Filter to keep samples with aesthetics scores within a specific range.

    -
    -__init__(min_snr: float = 0, max_snr: float = 9223372036854775807, nmf_iter_num: int[int] = 500, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_snr – The min audio SNR to keep samples in dB. It’s 0 by -default.

    • -
    • max_snr – The max audio SNR to keep samples in dB. It’s -sys.maxsize by default.

    • -
• nmf_iter_num – The max number of iterations to run NMF. It’s 500 by default.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all audios. ‘any’: keep this sample if any audios meet the -condition. ‘all’: keep this sample only if all audios meet the +

    • hf_scorer_model – Huggingface model name for the aesthetics +predictor. By default, we will use +‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’, +refer to pypi.org/project/simple-aesthetics-predictor

    • +
    • min_score – Min score for the predicted aesthetics in an image.

    • +
    • max_score – Max score for the predicted aesthetics in an image.

    • +
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the condition.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • args – Extra positional arguments.

    • +
    • kwargs – Extra keyword arguments.

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -606,8 +551,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -622,57 +567,34 @@
    -
    -class data_juicer.ops.filter.VideoAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageAspectRatioFilter(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep data samples with aesthetics scores for specified frames -in the videos within a specific range.

    +

    Filter to keep samples with image aspect ratio within a specific range. +AspectRatio = W / H.

    -
    -__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]
    +
    +__init__(min_ratio: float = 0.333, max_ratio: float = 3.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_scorer_model – Huggingface model name for the aesthetics -predictor. By default, we will use -‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’, -refer to pypi.org/project/simple-aesthetics-predictor

    • -
    • min_score – Min score for the predicted aesthetics in a video.

    • -
    • max_score – Max score for the predicted aesthetics in a video.

    • -
    • frame_sampling_method – sampling method of extracting frame -images from the videos. -Should be one of [“all_keyframes”, “uniform”]. -The former one extracts all key frames and the latter one extract -specified number of frames uniformly from the video. -Default: “uniform” with frame_num=3, considering that the number of -keyframes can be large while their difference is usually small -in terms of their aesthetics.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

    • -
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the +

    • min_ratio – The min aspect ratio to keep samples.

    • +
    • max_ratio – The max aspect ratio to keep samples.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the condition.

    • -
    • reduce_mode – reduce mode when one sample corresponds to -multiple frames, must be one of [‘avg’,’max’, ‘min’]. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • -
    • args – Extra positional arguments.

    • -
    • kwargs – Extra keyword arguments.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -690,8 +612,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -706,89 +628,97 @@
    -
    -class data_juicer.ops.filter.PerplexityFilter(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageFaceCountFilter(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with perplexity score less than a specific max -value.

    +

    Filter to keep samples with the number of faces within a specific range.

    -
    -__init__(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]
    +
    +__init__(cv_classifier: str = '', min_face_count: int = 1, max_face_count: int = 1, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – Compute perplexity for samples in which language.

    • -
    • max_ppl – The max filter perplexity in this op, samples -will be filtered if their perplexity exceeds this parameter.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • cv_classifier – OpenCV classifier path for face detection. +By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • +
    • min_face_count – Minimum number of faces required for samples.

    • +
    • max_face_count – Maximum number of faces required for samples.

    • +
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • +
    • args – Extra positional arguments.

    • +
    • kwargs – Extra keyword arguments.

    -
    -compute_stats_batched(samples, context=False)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    - -
    - -
    -
    -class data_juicer.ops.filter.PhraseGroundingRecallFilter(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +
    +compute_stats_single(sample, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    +
    +
    Parameters:
    +
      +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    • +
    +
    +
    Returns:
    +

    sample with computed stats

    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> Boolean.

    +
    +
    Parameters:
    +

    sample – sample to decide whether to filter

    +
    +
    Returns:
    +

    true for keeping and false for filtering

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.filter.ImageFaceRatioFilter(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples whose locating recalls of phrases extracted -from text in the images are within a specified range.

    +

    Filter to keep samples with face area ratios within a specific range.

    -
    -__init__(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]
    +
    +__init__(cv_classifier: str = '', min_ratio: float = 0.0, max_ratio: float = 0.4, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_owlvit – Owl-ViT model name on huggingface to locate the -phrases extracted from the text.

    • -
    • min_recall – The min phrase grounding recall to keep samples.

    • -
    • max_recall – The max phrase grounding recall to keep samples.

    • -
    • horizontal_flip – Flip image horizontally (left to right).

    • -
    • vertical_flip – Flip image vertically (top to bottom).

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +

    • cv_classifier – OpenCV classifier path for face detection. +By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • +
    • min_ratio – Min ratio for the largest face area in an image.

    • +
    • max_ratio – Max ratio for the largest face area in an image.

    • +
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of all images. ‘any’: keep this sample if any images meet the condition. ‘all’: keep this sample only if all images meet the condition.

    • -
    • reduce_mode – reduce mode when one text corresponds to -multiple images in a chunk. -‘avg’: Take the average of multiple values -‘max’: Take the max of multiple values -‘min’: Take the min of multiple values

    • -
    • iou_thr – the IoU threshold for NMS-like post-process. If two -predicted bboxes are overlap with an IoU larger than this -threshold, the bbox with less confidence will be removed. Default: -0.5.

    • -
    • large_area_ratio_thr – the area ratio threshold for filtering out -those large predicted bboxes. If the area of a predicted bbox -accounts for more than this ratio threshold of the whole image -area, this bbox will be removed. Default: 0.95.

    • -
    • conf_thr – the confidence score threshold for removing -low-confidence bboxes. If the confidence score of a predicted bbox -is lower than the threshold, this bbox will be removed. Default: 0.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • args – Extra positional arguments.

    • +
    • kwargs – Extra keyword arguments.

    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -806,8 +736,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -822,24 +752,25 @@
    -
    -class data_juicer.ops.filter.MaximumLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with maximum line length within a specific -range.

    +

    Filter to keep samples whose images have low nsfw scores.

    -
    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_len – The min filter length in this op, samples will -be filtered if their maximum line length is below this -parameter.

    • -
    • max_len – The max filter length in this op, samples will -be filtered if their maximum line length exceeds this -parameter.

    • +
    • hf_nsfw_model – nsfw detection model name on huggingface.

    • +
    • score_threshold – the nsfw score threshold for samples. +range from 0 to 1. Samples with nsfw score less than this threshold +will be kept.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • args – extra args

    • kwargs – extra args

    @@ -848,75 +779,133 @@
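An illustrative setup of the ImageNSFWFilter above; the model name is the documented default and the threshold is an example:

    from data_juicer.ops.filter import ImageNSFWFilter

    # keep a sample only if all of its images score below 0.3
    op = ImageNSFWFilter(
        hf_nsfw_model='Falconsai/nsfw_image_detection',
        score_threshold=0.3,
        any_or_all='all',
    )
    # sample = op.compute_stats_single(sample)
    # keep = op.process_single(sample)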
    -
    -compute_stats_batched(samples, context=False)[source]
    -
    +
    +compute_stats_single(sample, rank=None, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    +
    +
    Parameters:
    +
      +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    • +
    +
    +
    Returns:
    +

    sample with computed stats

    +
    +
    +
    -
    -process_batched(samples)[source]
    -
    +
    +process_single(sample, rank=None)[source]
    +

    For sample level, sample –> Boolean.

    +
    +
    Parameters:
    +

    sample – sample to decide whether to filter

    +
    +
    Returns:
    +

    true for keeping and false for filtering

    +
    +
    +
    -
    -class data_juicer.ops.filter.AverageLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with average line length within a specific -range.

    +

    Filter to keep image pairs with similarities between images +within a specific range.

    -
    -__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    +
    +
    +
    param hf_clip:
    +

clip model name on huggingface to compute the similarity between the two images in a pair.

    +
    +
    param min_score:
    +

    The min similarity to keep samples.

    +
    +
    param max_score:
    +

    The max similarity to keep samples.

    +
    +
    param any_or_all:
    +

    keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    +
    +
    param args:
    +

    extra args

    +
    +
    param kwargs:
    +

    extra args

    +
    +
    +
    +
    + +
    +
    +compute_stats_single(sample, rank=None, context=False)[source]
    +

    Compute stats for the sample which is used as a metric to decide +whether to filter this sample.

    Parameters:
      -
    • min_len – The min filter length in this op, samples will -be filtered if their average line length is below this -parameter.

    • -
    • max_len – The max filter length in this op, samples will -be filtered if their average line length exceeds this -parameter.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • sample – input sample.

    • +
    • context – whether to store context information of intermediate +vars in the sample temporarily.

    +
    Returns:
    +

    sample with computed stats

    +
    -
    -compute_stats_batched(samples, context=False)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    +
    +process_single(sample, rank=None)[source]
    +

    For sample level, sample –> Boolean.

    +
    +
    Parameters:
    +

    sample – sample to decide whether to filter

    +
    +
    Returns:
    +

    true for keeping and false for filtering

    +
    +
    +
    -
    -class data_juicer.ops.filter.SpecifiedFieldFilter(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageShapeFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter based on specified field information.

    -

    If the specified field information in the sample is not within the -specified target value, the sample will be filtered.

    +

    Filter to keep samples with image shape (w, h) within specific ranges.

    -
    -__init__(field_key: str = '', target_value: List = [], *args, **kwargs)[source]
    +
    +__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • field_key – Filter based on the specified value -corresponding to the target key. The target key -corresponding to multi-level field information need to be -separated by ‘.’.

    • -
    • target_value – The range of specified field information -corresponding to the samples that need to be retained.

    • +
    • min_width – The min width to keep samples.

    • +
    • max_width – The max width to keep samples.

    • +
    • min_height – The min height to keep samples.

    • +
    • max_height – The max height to keep samples.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • args – extra args

    • kwargs – extra args

    @@ -925,8 +914,8 @@
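A usage sketch for the ImageShapeFilter above (pixel bounds are illustrative):

    from data_juicer.ops.filter import ImageShapeFilter

    # drop samples unless every image is at least 224x224 pixels
    op = ImageShapeFilter(min_width=224, min_height=224, any_or_all='all')
    # sample = op.compute_stats_single(sample)
    # keep = op.process_single(sample)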
    -
    -compute_stats_single(sample)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -944,8 +933,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -960,40 +949,25 @@
    -
    -class data_juicer.ops.filter.VideoTaggingFromFramesFilter(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples whose videos contain the given tags.

    +

Keep data samples whose image size (in Bytes/KB/MB/…) is within a specific range.

    -
    -__init__(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • tags – a tag list to shift the videos, total tags can be found -in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt # noqa: E501

    • -
    • contain – require the videos containing ‘any’ or ‘all’ tags. -When tags equal to [], ‘all’ keeps all samples, ‘any’ keeps no -sample.

    • -
    • frame_sampling_method – sampling method of extracting frame -images from the videos. Should be one of -[“all_keyframes”, “uniform”]. -The former one extracts all key frames (the number of which depends -on the duration of the video) and the latter one extract specified -number of frames uniformly from the video. -Default: “all_keyframes”.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

    • -
    • tag_field_name – the field name to store the tags. It’s -“__dj__video_frame_tags__” in default.

    • +
• min_size – The min image size to keep samples. Set to “0” by default for no size constraint.

    • +
• max_size – The max image size to keep samples. Set to “1TB” by default, an approximation of the unlimited case.

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the condition.

    • args – extra args

    • kwargs – extra args

    • @@ -1003,8 +977,8 @@
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1022,8 +996,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1038,34 +1012,43 @@
    -
    -class data_juicer.ops.filter.TextEntityDependencyFilter(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageTextMatchingFilter(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    -

Identify the entities in the text which are independent of other tokens, and filter them. The text containing no entities will be omitted.

    +

Filter to keep samples whose matching scores between image and text are within a specific range.

    -
    -__init__(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]
    +
    +__init__(hf_blip: str = 'Salesforce/blip-itm-base-coco', trust_remote_code: bool = False, min_score: float = 0.003, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – language of the text in the samples. ‘en’ for detection of -entities in English and ‘zh’ for detection of entities in Chinese.

    • -
• min_dependency_num – The min token number in the filtering. Objects are independent if their number of edges in the dependency tree is below this parameter.

    • -
• any_or_all – keep this sample with ‘any’ or ‘all’ strategy. ‘any’: keep this sample if any object is dependent. ‘all’: keep this sample only if all objects are dependent.

    • +
    • hf_blip – blip model name on huggingface to compute +the matching score between image and text.

    • +
    • min_score – The min matching score to keep samples.

    • +
    • max_score – The max matching score to keep samples.

    • +
    • horizontal_flip – Flip image horizontally (left to right).

    • +
    • vertical_flip – Flip image vertically (top to bottom).

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • +
    • reduce_mode – reduce mode when one text corresponds to +multiple images in a chunk. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • +
    • args – extra args

    • +
    • kwargs – extra args

    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1083,8 +1066,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1099,25 +1082,33 @@
    -
    -class data_juicer.ops.filter.VideoResolutionFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.ImageTextSimilarityFilter(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    -

    Keep data samples whose videos’ resolutions are within a specified range.

    +

Filter to keep samples whose similarities between image and text are within a specific range.

    -
    -__init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(hf_clip: str = 'openai/clip-vit-base-patch32', trust_remote_code: bool = False, min_score: float = 0.1, max_score: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_width – The min horizontal resolution.

    • -
    • max_width – The max horizontal resolution.

    • -
    • min_height – The min vertical resolution.

    • -
    • max_height – The max vertical resolution.

    • +
    • hf_clip – clip model name on huggingface to compute +the similarity between image and text.

    • +
    • min_score – The min similarity to keep samples.

    • +
    • max_score – The max similarity to keep samples.

    • +
    • horizontal_flip – Flip image horizontally (left to right).

    • +
    • vertical_flip – Flip image vertically (top to bottom).

    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the condition.

    • +
    • reduce_mode – reduce mode when one text corresponds to +multiple images in a chunk. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • args – extra args

    • kwargs – extra args

    @@ -1126,8 +1117,8 @@
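An illustrative configuration of the ImageTextSimilarityFilter above, using the documented default CLIP model; the score range is an example:

    from data_juicer.ops.filter import ImageTextSimilarityFilter

    op = ImageTextSimilarityFilter(
        hf_clip='openai/clip-vit-base-patch32',
        min_score=0.2,
        max_score=1.0,
        reduce_mode='avg',   # average when one text maps to several images
    )
    # sample = op.compute_stats_single(sample)
    # keep = op.process_single(sample)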
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1145,8 +1136,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1160,48 +1151,6 @@
    -
    -
    -class data_juicer.ops.filter.AlphanumericFilter(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]
    -

    Bases: Filter

    -

    Filter to keep samples with alphabet/numeric ratio within a specific -range.

    -
    -
    -__init__(tokenization: bool = False, min_ratio: float = 0.25, max_ratio: float = 9223372036854775807, *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • tokenization – Whether to count the ratio of alphanumeric -to the total number of tokens. if tokenization=False, it -will count the ratio of alphanumeric to the total number of -characters.

    • -
    • min_ratio – The min filter ratio in alphanumeric op, -samples will be filtered if their alphabet/numeric ratio is -below this parameter.

    • -
    • max_ratio – The max filter ratio in alphanumeric op, -samples will be filtered if their alphabet/numeric ratio -exceeds this parameter.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -compute_stats_batched(samples)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    - -
    -
    class data_juicer.ops.filter.ImageWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, any_or_all: str = 'any', *args, **kwargs)[source]
    @@ -1267,37 +1216,31 @@
    -
    -class data_juicer.ops.filter.ImageAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.LanguageIDScoreFilter(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with aesthetics scores within a specific range.

    +

    Filter to keep samples in a specific language with confidence score +larger than a specific min value.

    -
    -__init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.5, max_score: float = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str | List[str] = '', min_score: float = 0.8, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_scorer_model – Huggingface model name for the aesthetics -predictor. By default, we will use -‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’, -refer to pypi.org/project/simple-aesthetics-predictor

    • -
    • min_score – Min score for the predicted aesthetics in an image.

    • -
    • max_score – Max score for the predicted aesthetics in an image.

    • -
    • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    • -
    • args – Extra positional arguments.

    • -
    • kwargs – Extra keyword arguments.

    • +
    • lang – Samples in which languages to keep.

    • +
    • min_score – The min language identification confidence +scores of samples to keep.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    +
    +compute_stats_single(sample)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -1315,8 +1258,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -1331,26 +1274,117 @@
    -
    -class data_juicer.ops.filter.AudioSizeFilter(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.MaximumLineLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

Keep data samples whose audio size (in bytes/KB/MB/…) is within a specific range.

    +

    Filter to keep samples with maximum line length within a specific +range.

    -
    -__init__(min_size: str = '0', max_size: str = '1TB', any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_size – The min audio size to keep samples. set to be “0” by -default for no size constraint

    • -
    • max_size – The max audio size to keep samples. set to be -“1Tb” by default, an approximate for un-limited case

    • +
    • min_len – The min filter length in this op, samples will +be filtered if their maximum line length is below this +parameter.

    • +
    • max_len – The max filter length in this op, samples will +be filtered if their maximum line length exceeds this +parameter.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +compute_stats_batched(samples, context=False)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    +
    + +
    + +
    +
class data_juicer.ops.filter.PerplexityFilter(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with perplexity score less than a specific max value.

    __init__(lang: str = 'en', max_ppl: float = 1500, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • lang – Compute perplexity for samples in which language.
        • max_ppl – The max filter perplexity in this op, samples will be filtered if their perplexity exceeds this parameter.
        • args – extra args
        • kwargs – extra args

    compute_stats_batched(samples, context=False)[source]

    process_batched(samples)[source]
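The batched variants operate on a dict of columns rather than a single sample. A hedged sketch; the column layout with a 'text' list and a '__dj__stats__' list, and the boolean-per-sample return value, are assumptions about the batched convention:

    from data_juicer.ops.filter import PerplexityFilter

    # drop samples whose language-model perplexity exceeds 1000
    op = PerplexityFilter(lang='en', max_ppl=1000)

    samples = {'text': ['A fluent English sentence.', 'zxqw vprt qqqq'],
               '__dj__stats__': [{}, {}]}
    samples = op.compute_stats_batched(samples)
    keep_flags = list(op.process_batched(samples))  # one keep/filter flag per sample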
class data_juicer.ops.filter.PhraseGroundingRecallFilter(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose locating recalls of phrases extracted from text in the images are within a specified range.

    __init__(hf_owlvit: str = 'google/owlvit-base-patch32', trust_remote_code: bool = False, min_recall: float = 0.1, max_recall: float = 1.0, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', iou_thr: float = 0.5, large_area_ratio_thr: float = 0.95, conf_thr: float = 0.0, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • hf_owlvit – Owl-ViT model name on huggingface to locate the phrases extracted from the text.
        • min_recall – The min phrase grounding recall to keep samples.
        • max_recall – The max phrase grounding recall to keep samples.
        • horizontal_flip – Flip image horizontally (left to right).
        • vertical_flip – Flip image vertically (top to bottom).
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all images. ‘any’: keep this sample if any images meet the condition. ‘all’: keep this sample only if all images meet the condition.
        • reduce_mode – reduce mode when one text corresponds to multiple images in a chunk. ‘avg’: take the average of multiple values. ‘max’: take the max of multiple values. ‘min’: take the min of multiple values.
        • iou_thr – the IoU threshold for NMS-like post-processing. If two predicted bboxes overlap with an IoU larger than this threshold, the bbox with less confidence will be removed. Default: 0.5.
        • large_area_ratio_thr – the area ratio threshold for filtering out large predicted bboxes. If the area of a predicted bbox accounts for more than this ratio of the whole image area, this bbox will be removed. Default: 0.95.
        • conf_thr – the confidence score threshold for removing low-confidence bboxes. If the confidence score of a predicted bbox is lower than this threshold, the bbox will be removed. Default: 0.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
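A hedged instantiation sketch; the stricter recall floor and the 'all' strategy are illustrative choices, not defaults:

    from data_juicer.ops.filter import PhraseGroundingRecallFilter

    # keep image-text pairs only if at least 30% of the extracted phrases
    # can be located in every image of the sample
    op = PhraseGroundingRecallFilter(
        hf_owlvit='google/owlvit-base-patch32',
        min_recall=0.3,
        any_or_all='all',
    )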
class data_juicer.ops.filter.SpecialCharactersFilter(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with special-char ratio within a specific range.

    __init__(min_ratio: float = 0.0, max_ratio: float = 0.25, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_ratio – The min filter ratio in this op, samples will be filtered if their special-char ratio is below this parameter.
        • max_ratio – The max filter ratio in this op, samples will be filtered if their special-char ratio exceeds this parameter.
        • args – extra args
        • kwargs – extra args

    compute_stats_batched(samples)[source]

    process_batched(samples)[source]
class data_juicer.ops.filter.SpecifiedFieldFilter(field_key: str = '', target_value: List = [], *args, **kwargs)[source]

    Bases: Filter

    Filter based on specified field information.

    If the specified field information in the sample is not within the specified target value, the sample will be filtered.

    __init__(field_key: str = '', target_value: List = [], *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • field_key – Filter based on the specified value corresponding to the target key. A target key pointing to multi-level field information needs its levels separated by ‘.’.
        • target_value – The range of specified field information corresponding to the samples that need to be retained.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
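A hedged sketch; the 'meta.language' field and its allowed values are made-up illustrations of the dotted multi-level key syntax:

    from data_juicer.ops.filter import SpecifiedFieldFilter

    # keep samples whose sample['meta']['language'] is 'en' or 'zh'
    op = SpecifiedFieldFilter(field_key='meta.language', target_value=['en', 'zh'])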
class data_juicer.ops.filter.SpecifiedNumericFieldFilter(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter based on specified numeric field information.

    If the specified numeric information in the sample is not within the specified range, the sample will be filtered.

    __init__(field_key: str = '', min_value: float = -9223372036854775807, max_value: float = 9223372036854775807, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • field_key – Filter based on the specified numeric value corresponding to the target key. A target key pointing to multi-level field information needs its levels separated by ‘.’.
        • min_value – The min filter value in SpecifiedNumericField op, samples will be filtered if their specified numeric field value is below this parameter.
        • max_value – The max filter value in SpecifiedNumericField op, samples will be filtered if their specified numeric field value exceeds this parameter.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
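Same dotted-key convention as above, but with a numeric range instead of a value set; the 'meta.quality_score' key and bounds are hypothetical:

    from data_juicer.ops.filter import SpecifiedNumericFieldFilter

    # keep samples whose sample['meta']['quality_score'] lies in [0.5, 1.0]
    op = SpecifiedNumericFieldFilter(field_key='meta.quality_score',
                                     min_value=0.5, max_value=1.0)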
class data_juicer.ops.filter.StopWordsFilter(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with stopword ratio larger than a specific min value.

    __init__(lang: str = 'en', tokenization: bool = False, min_ratio: float = 0.3, stopwords_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • lang – Consider stopwords in what language. If lang == “all”, we will adopt the one merged from all the available languages.
        • tokenization – whether to use model to tokenize documents
        • min_ratio – The min filter ratio in this op.
        • stopwords_dir – The directory storing the stopwords file(s) whose name includes “stopwords” and in json format
        • use_words_aug – Whether to augment words, especially for Chinese and Vietnamese
        • words_aug_group_sizes – The group size of words to augment
        • words_aug_join_char – The join char between words to augment
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
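A hedged sketch; the 0.25 floor is illustrative and the bundled stopword lists are loaded from the assets directory shown in the signature above:

    from data_juicer.ops.filter import StopWordsFilter

    # filter out samples whose stopword ratio is below 0.25
    op = StopWordsFilter(lang='en', min_ratio=0.25)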
class data_juicer.ops.filter.SuffixFilter(suffixes: str | List[str] = [], *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with specified suffix.

    __init__(suffixes: str | List[str] = [], *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • suffixes – the suffixes of text that will be kept. For example: ‘.txt’, ‘txt’ or [‘txt’, ‘.pdf’, ‘docx’]
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.TextActionFilter(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep texts that contain actions.

    __init__(lang: str = 'en', min_action_num: int = 1, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • lang – language of the text in the samples. ‘en’ for detection of actions in English and ‘zh’ for detection of actions in Chinese.
        • min_action_num – The min action number in the filtering. Samples will be filtered if the action number in their text is below this parameter.

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.TextEntityDependencyFilter(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

    Bases: Filter

    Identify entities in the text that are independent of other tokens and filter the samples accordingly. Text containing no entities will be omitted.

    __init__(lang: str = 'en', min_dependency_num: int = 1, any_or_all: str = 'all', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • lang – language of the text in the samples. ‘en’ for detection of entities in English and ‘zh’ for detection of entities in Chinese.
        • min_dependency_num – The min token number in the filtering. An entity is considered independent if its number of edges in the dependency tree is below this parameter.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy. ‘any’: keep this sample if any entity is dependent. ‘all’: keep this sample only if all entities are dependent.

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.TextLengthFilter(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with total text length within a specific range.

    __init__(min_len: int = 10, max_len: int = 9223372036854775807, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_len – The min text length in the filtering. Samples will be filtered if their text length is below this parameter.
        • max_len – The max text length in the filtering. Samples will be filtered if their text length exceeds this parameter.
        • args – extra args
        • kwargs – extra args

    compute_stats_batched(samples)[source]

    process_batched(samples)[source]
class data_juicer.ops.filter.TokenNumFilter(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with total token number within a specific range.

    __init__(hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • hf_tokenizer – the tokenizer name of Hugging Face tokenizers.
        • min_num – The min filter token number in this op, samples will be filtered if their token number is below this parameter.
        • max_num – The max filter token number in this op, samples will be filtered if their token number exceeds this parameter.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
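A hedged sketch combining the two length-style filters above; the bounds are arbitrary examples:

    from data_juicer.ops.filter import TextLengthFilter, TokenNumFilter

    # character-level length window
    len_op = TextLengthFilter(min_len=50, max_len=20000)

    # token-level window, counted with a Hugging Face tokenizer
    tok_op = TokenNumFilter(hf_tokenizer='EleutherAI/pythia-6.9b-deduped',
                            min_num=16, max_num=4096)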
class data_juicer.ops.filter.VideoAestheticsFilter(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep data samples with aesthetics scores for specified frames in the videos within a specific range.

    __init__(hf_scorer_model: str = '', trust_remote_code: bool = False, min_score: float = 0.4, max_score: float = 1.0, frame_sampling_method: str = 'uniform', frame_num: int[int] = 3, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • hf_scorer_model – Huggingface model name for the aesthetics predictor. By default, we will use ‘shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE’, refer to pypi.org/project/simple-aesthetics-predictor
        • min_score – Min score for the predicted aesthetics in a video.
        • max_score – Max score for the predicted aesthetics in a video.
        • frame_sampling_method – sampling method of extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames and the latter extracts a specified number of frames uniformly from the video. Default: “uniform” with frame_num=3, considering that the number of keyframes can be large while their difference is usually small in terms of their aesthetics.
        • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it’s 1, only the middle frame will be extracted. If it’s 2, only the first and the last frames will be extracted. If it’s larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
        • any_or_all – Keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • reduce_mode – reduce mode when one sample corresponds to multiple frames, must be one of [‘avg’, ‘max’, ‘min’]. ‘avg’: take the average of multiple values. ‘max’: take the max of multiple values. ‘min’: take the min of multiple values.
        • args – Extra positional arguments.
        • kwargs – Extra keyword arguments.

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.VideoAspectRatioFilter(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with video aspect ratio within a specific range. AspectRatio = W / H.

    __init__(min_ratio: str = '9/21', max_ratio: str = '21/9', any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_ratio – The minimum aspect ratio to keep samples, supported format is a string, such as “9:21” or “9/21”.
        • max_ratio – The maximum aspect ratio to keep samples, supported format is a string, such as “21:9” or “21/9”.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.VideoDurationFilter(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose videos’ durations are within a specified range.

    __init__(min_duration: float = 0, max_duration: float = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_duration – The min video duration to keep samples in seconds. It’s 0 by default.
        • max_duration – The max video duration to keep samples in seconds. It’s sys.maxsize by default.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
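A hedged sketch for the two geometry/duration video filters above; the 5–600 second window and the 'all' strategy are illustrative choices:

    from data_juicer.ops.filter import VideoAspectRatioFilter, VideoDurationFilter

    # keep samples only if every attached video is between 9:21 and 21:9 ...
    ar_op = VideoAspectRatioFilter(min_ratio='9/21', max_ratio='21/9', any_or_all='all')

    # ... and lasts between 5 seconds and 10 minutes
    dur_op = VideoDurationFilter(min_duration=5, max_duration=600, any_or_all='all')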
class data_juicer.ops.filter.VideoFramesTextSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose similarities between sampled video frame images and text are within a specific range.

    __init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: float = 0.1, max_score: float = 1.0, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, any_or_all: str = 'any', reduce_mode: str = 'avg', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • hf_clip – clip model name on huggingface to compute the similarity between frame image and text. It’s kind of language-related. For example, for Chinese datasets, ChineseCLIP might be a better choice.
        • min_score – the min similarity to keep samples.
        • max_score – the max similarity to keep samples.
        • frame_sampling_method – sampling method of extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames (the number of which depends on the duration of the video) and the latter extracts a specified number of frames uniformly from the video. Default: “all_keyframes”.
        • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it’s 1, only the middle frame will be extracted. If it’s 2, only the first and the last frames will be extracted. If it’s larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
        • horizontal_flip – flip frame image horizontally (left to right).
        • vertical_flip – flip frame image vertically (top to bottom).
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • reduce_mode – reduce mode when one text corresponds to multiple video frame images in a chunk. ‘avg’: take the average of multiple values. ‘max’: take the max of multiple values. ‘min’: take the min of multiple values.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample, rank=None)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples with video motion scores within a specific range. Farneback’s algorithm from OpenCV is used to compute dense optical flow.

    __init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_score – The minimum motion score to keep samples.
        • max_score – The maximum motion score to keep samples.
        • sampling_fps – The sampling rate in frames_per_second for optical flow calculations.
        • size – Resize frames before computing optical flow. If size is a sequence like (h, w), frame size will be matched to this. If size is an int, the smaller edge of frames will be matched to this number, i.e. if height > width, the frame will be rescaled to (size * height / width, size). Default None to keep the original size.
        • max_size – The maximum allowed for the longer edge of resized frames. If the longer edge of frames is greater than max_size after being resized according to size, size will be overruled so that the longer edge is equal to max_size. As a result, the smaller edge may be shorter than size. This is only supported if size is an int.
        • relative – If True, the optical flow magnitude is normalized to a [0, 1] range, relative to the frame’s diagonal length.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
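A hedged sketch for filtering out near-static clips; the thresholds are illustrative, and relative=True rescales the flow magnitude by the frame diagonal as described above:

    from data_juicer.ops.filter import VideoMotionScoreFilter

    # drop videos whose motion score stays below 1% of the frame diagonal
    op = VideoMotionScoreFilter(min_score=0.01, sampling_fps=2, relative=True)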
class data_juicer.ops.filter.VideoNSFWFilter(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose videos have low nsfw scores.

    __init__(hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', trust_remote_code: bool = False, score_threshold: float = 0.5, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • hf_nsfw_model – nsfw detection model name on huggingface.
        • score_threshold – the nsfw score threshold for samples, range from 0 to 1. Samples with nsfw score less than this threshold will be kept.
        • frame_sampling_method – sampling method of extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames (the number of which depends on the duration of the video) and the latter extracts a specified number of frames uniformly from the video. Default: “all_keyframes”.
        • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it’s 1, only the middle frame will be extracted. If it’s 2, only the first and the last frames will be extracted. If it’s larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
        • reduce_mode – reduce mode for multiple sampled video frames. ‘avg’: take the average of multiple values. ‘max’: take the max of multiple values. ‘min’: take the min of multiple values.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample, rank=None)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.VideoOcrAreaRatioFilter(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose detected text area ratios for specified frames in the video are within a specified range.

    __init__(min_area_ratio: float = 0, max_area_ratio: float = 1.0, frame_sample_num: int[int] = 3, languages_to_detect: str | List[str] = ['ch_sim', 'en'], any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_area_ratio – The min ocr area ratio to keep samples. It’s 0 by default.
        • max_area_ratio – The max ocr area ratio to keep samples. It’s 1.0 by default.
        • frame_sample_num – The number of sampled frames to calculate the ocr area ratio. If it’s 1, only the middle frame will be selected. If it’s 2, only the first and the last frames will be selected. If it’s larger than 2, in addition to the first and the last frames, other frames will be sampled evenly within the video duration.
        • languages_to_detect – texts in which languages should be detected. Default: [‘ch_sim’, ‘en’]. The full language list can be found here: https://www.jaided.ai/easyocr/.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    get_reader(rank)[source]

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

        Parameters:
        • sample – input sample.
        • context – whether to store context information of intermediate vars in the sample temporarily.

        Returns:
        sample with computed stats

    process_single(sample)[source]

        For sample level, sample –> Boolean.

        Parameters:
        sample – sample to decide whether to filter

        Returns:
        true for keeping and false for filtering
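A hedged sketch that rejects text-heavy clips; the 20% ceiling and the frame/language choices are arbitrary examples:

    from data_juicer.ops.filter import VideoOcrAreaRatioFilter

    # drop samples whose sampled frames show OCR-detected text
    # covering more than 20% of the frame area
    op = VideoOcrAreaRatioFilter(max_area_ratio=0.2,
                                 frame_sample_num=5,
                                 languages_to_detect=['en'])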
class data_juicer.ops.filter.VideoResolutionFilter(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Keep data samples whose videos’ resolutions are within a specified range.

    __init__(min_width: int = 1, max_width: int = 9223372036854775807, min_height: int = 1, max_height: int = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • min_width – The min horizontal resolution.
        • max_width – The max horizontal resolution.
        • min_height – The min vertical resolution.
        • max_height – The max vertical resolution.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample)[source]

        For sample level, sample –> Boolean.
class data_juicer.ops.filter.VideoTaggingFromFramesFilter(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    Filter to keep samples whose videos contain the given tags.

    __init__(tags: List[str] = ['people'], contain: str = 'any', frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', any_or_all: str = 'any', *args, **kwargs)[source]

        Initialization method.

        Parameters:
        • tags – a tag list to sift the videos; the full tag list can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt
        • contain – require the videos containing ‘any’ or ‘all’ tags. When tags equal to [], ‘all’ keeps all samples, ‘any’ keeps no sample.
        • frame_sampling_method – sampling method of extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames (the number of which depends on the duration of the video) and the latter extracts a specified number of frames uniformly from the video. Default: “all_keyframes”.
        • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it’s 1, only the middle frame will be extracted. If it’s 2, only the first and the last frames will be extracted. If it’s larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
        • tag_field_name – the field name to store the tags. It’s “__dj__video_frame_tags__” by default.
        • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of all videos. ‘any’: keep this sample if any videos meet the condition. ‘all’: keep this sample only if all videos meet the condition.
        • args – extra args
        • kwargs – extra args

    compute_stats_single(sample, rank=None, context=False)[source]

        Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    process_single(sample, rank=None)[source]

        For sample level, sample –> Boolean.
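A hedged sketch; the tag choice follows the RAM tag list referenced above, and the combination of contain='any' with any_or_all='all' is one illustrative configuration:

    from data_juicer.ops.filter import VideoTaggingFromFramesFilter

    # keep samples only when every attached video is tagged with 'people'
    # in at least one of its sampled frames
    op = VideoTaggingFromFramesFilter(tags=['people'], contain='any', any_or_all='all')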
    -
    -class data_juicer.ops.filter.FlaggedWordFilter(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.VideoWatermarkFilter(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with flagged-word ratio less than a specific max -value.

    +

    Filter to keep samples whose videos have no watermark with high +probability.

    -
    -__init__(lang: str = 'en', tokenization: bool = False, max_ratio: float = 0.045, flagged_words_dir: str = '/home/runner/.cache/data_juicer/assets', use_words_aug: bool = False, words_aug_group_sizes: List[int[int]] = [2], words_aug_join_char: str = '', *args, **kwargs)[source]
    +
    +__init__(hf_watermark_model: str = 'amrul-hzz/watermark_detector', trust_remote_code: bool = False, prob_threshold: float = 0.8, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, reduce_mode: str = 'avg', any_or_all: str = 'any', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – Consider flagged words in what language. If lang == -“all”, we will adopt the one merged from all the available -languages

    • -
    • tokenization – Whether to use model to tokenize documents

    • -
    • max_ratio – The max filter ratio in this op.

    • -
    • flagged_words_dir – The directory storing the -flagged_words file(s) whose name includes “flagged_words” -and in json format

    • -
    • use_words_aug – Whether to augment words, especially for -Chinese and Vietnamese

    • -
    • words_aug_group_sizes – The group size of words to augment

    • -
    • words_aug_join_char – The join char between words to -augment

    • +
    • hf_watermark_model – watermark detection model name on +huggingface.

    • +
    • prob_threshold – the predicted watermark probability threshold +for samples. Range from 0 to 1. Samples with watermark probability +less than this threshold will be kept.

    • +
    • frame_sampling_method – sampling method of extracting frame +images from the videos. +Should be one of [“all_keyframes”, “uniform”]. +The former one extracts all key frames (the number of which depends +on the duration of the video) and the latter one extracts a specified +number of frames uniformly from the video. +Default: “all_keyframes”.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video. Only works when frame_sampling_method is “uniform”. If +it’s 1, only the middle frame will be extracted. If it’s 2, only +the first and the last frames will be extracted. If it’s larger +than 2, in addition to the first and the last frames, other frames +will be extracted uniformly within the video duration.

    • +
    • reduce_mode – reduce mode for multiple sampled video frames. +‘avg’: Take the average of multiple values +‘max’: Take the max of multiple values +‘min’: Take the min of multiple values

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all videos. ‘any’: keep this sample if any videos meet the +condition. ‘all’: keep this sample only if all videos meet the +condition.

    • args – extra args

    • kwargs – extra args

    @@ -2537,8 +2645,8 @@
    -
    -compute_stats_single(sample, context=False)[source]
    +
    +compute_stats_single(sample, rank=None, context=False)[source]

    Compute stats for the sample which is used as a metric to decide whether to filter this sample.

    @@ -2556,8 +2664,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None)[source]

    For sample level, sample –> Boolean.

    Parameters:
    @@ -2613,38 +2721,26 @@
    -
    -class data_juicer.ops.filter.VideoMotionScoreFilter(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.filter.WordsNumFilter(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Bases: Filter

    -

    Filter to keep samples with video motion scores within a specific range. The -Farneback’s algorith from OpenCV is used to compute dense optical flow.

    +

    Filter to keep samples whose total number of words is within a specific +range.

    -
    -__init__(min_score: float = 0.25, max_score: float = 1.7976931348623157e+308, sampling_fps: float[float] = 2, size: int[int] | Tuple[int[int]] | Tuple[int[int], int[int]] | None = None, max_size: int[int] | None = None, relative: bool = False, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(lang: str = 'en', tokenization: bool = False, min_num: int = 10, max_num: int = 9223372036854775807, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_score – The minimum motion score to keep samples.

    • -
    • max_score – The maximum motion score to keep samples.

    • -
    • sampling_fps – The sampling rate in frames_per_second for -optical flow calculations.

    • -
    • size – Resize frames before computing optical flow. If size is a -sequence like (h, w), frame size will be matched to this. If size -is an int, smaller edge of frames will be matched to this number. -i.e, if height > width, then frame will be rescaled to (size * -height / width, size). Default None to keep the original size.

    • -
    • max_size – The maximum allowed for the longer edge of resized -frames. If the longer edge of frames is greater than max_size after -being resized according to size, size will be overruled so that the -longer edge is equal to max_size. As a result, the smaller edge may -be shorter than size. This is only supported if size is an int.

    • -
    • relative – If True, the optical flow magnitude is normalized to -a [0, 1] range, relative to the frame’s diagonal length.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all videos. ‘any’: keep this sample if any videos meet the -condition. ‘all’: keep this sample only if all videos meet the -condition.

    • +
    • lang – sample in which language.

    • +
    • tokenization – whether to use model to tokenize documents

    • +
    • min_num – The min filter word number in this op, samples +will be filtered if their word number is below this +parameter.

    • +
    • max_num – The max filter word number in this op, samples +will be filtered if their word number exceeds this +parameter.

    • args – extra args

    • kwargs – extra args

    @@ -2653,110 +2749,14 @@
    -
    -compute_stats_single(sample, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    - -
    -
    -process_single(sample)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.ops.filter.ImagePairSimilarityFilter(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    -

    Bases: Filter

    -

    Filter to keep image pairs with similarities between images -within a specific range.

    -
    -
    -__init__(hf_clip='openai/clip-vit-base-patch32', trust_remote_code=False, min_score: ClosedUnitInterval = 0.1, max_score: ClosedUnitInterval = 1.0, any_or_all: str = 'any', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    -
    param hf_clip:
    -

    clip model name on huggingface to compute -the similarity between image and text.

    -
    -
    param min_score:
    -

    The min similarity to keep samples.

    -
    -
    param max_score:
    -

    The max similarity to keep samples.

    -
    -
    param any_or_all:
    -

    keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    -
    -
    param args:
    -

    extra args

    -
    -
    param kwargs:
    -

    extra args

    -
    -
    -
    -
    - -
    -
    -compute_stats_single(sample, rank=None, context=False)[source]
    -

    Compute stats for the sample which is used as a metric to decide -whether to filter this sample.

    -
    -
    Parameters:
    -
      -
    • sample – input sample.

    • -
    • context – whether to store context information of intermediate -vars in the sample temporarily.

    • -
    -
    -
    Returns:
    -

    sample with computed stats

    -
    -
    -
    +
    +compute_stats_batched(samples, context=False)[source]
    +
    -
    -process_single(sample, rank=None)[source]
    -

    For sample level, sample –> Boolean.

    -
    -
    Parameters:
    -

    sample – sample to decide whether to filter

    -
    -
    Returns:
    -

    true for keeping and false for filtering

    -
    -
    -
    +
    +process_batched(samples)[source]
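    A minimal usage sketch for WordsNumFilter, assuming the batched, column-oriented sample format with the default 'text' key and the '__dj__stats__' stats field:

```
from data_juicer.ops.filter import WordsNumFilter

# keep samples whose word count lies in [3, 20]
op = WordsNumFilter(lang='en', min_num=3, max_num=20)
samples = {
    'text': ['too short', 'this sentence has exactly six words'],
    '__dj__stats__': [{}, {}],  # per-sample stats dicts filled by compute_stats_batched
}
samples = op.compute_stats_batched(samples)
keep_flags = list(op.process_batched(samples))  # e.g. [False, True]
```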
    +
    diff --git a/data_juicer.ops.mapper.html b/data_juicer.ops.mapper.html index baf51b62e..16326ffcb 100644 --- a/data_juicer.ops.mapper.html +++ b/data_juicer.ops.mapper.html @@ -46,53 +46,55 @@
  • data_juicer.ops
  • data_juicer.ops.filter
  • data_juicer.ops.mapper
  • data_juicer.ops.deduplicator
  • @@ -130,53 +132,22 @@

    data_juicer.ops.mapper

    -
    -class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
    -

    Bases: Mapper

    -

    Mapper to caption a video according to its audio streams based on -Qwen-Audio model.

    -
    -
    -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only captioned sample in the -final datasets and the original sample will be removed. It’s True -in default.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -process_batched(samples, rank=None)[source]
    -
    - -
    - -
    -
    -class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to generate video tags from audio streams extracted by video -using the Audio Spectrogram Transformer.

    +

    Simple wrapper for FFmpeg audio filters.

    -
    -__init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]
    +
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_ast – path to the HF model to tag from audios.

    • -
    • trust_remote_code – whether to trust the remote code of HF models

    • -
    • tag_field_name – the field name to store the tags. It’s -“__dj__video_audio_tags__” in default.

    • +
    • filter_name – ffmpeg audio filter name.

    • +
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • +
    • global_args – list-arguments passed to ffmpeg command-line.

    • +
    • capture_stderr – whether to capture stderr.

    • +
    • overwrite_output – whether to overwrite output file.

    • args – extra args

    • kwargs – extra args
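    A minimal usage sketch, assuming the default 'audios' key holding file paths and an ffmpeg binary on the system ('input.wav' is a placeholder):

```
from data_juicer.ops.mapper import AudioFFmpegWrappedMapper

op = AudioFFmpegWrappedMapper(
    filter_name='atempo',          # ffmpeg audio filter to apply
    filter_kwargs={'tempo': 0.9},  # slow the audio down slightly
)
sample = {'audios': ['input.wav']}
result = op.process_single(sample)  # 'audios' should now reference the filtered output files
```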

    @@ -185,8 +156,8 @@
    -
    -process_single(sample, rank=None)[source]
    +
    +process_single(sample)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -201,44 +172,42 @@
    -
    -class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to generate samples whose texts are generated based on -gpt-4-visison and the image.

    +

    Mapper to convert Chinese between Traditional Chinese, Simplified Chinese +and Japanese Kanji.

    -
    -__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +
    +__init__(mode: str = 's2t', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • mode – mode of text generated from images, can be one of -[‘resoning’, ‘description’, ‘conversation’, ‘custom’]

    • -
    • api_key – the API key to authenticate the request.

    • -
    • max_token – the maximum number of tokens to generate. -Default is 500.

    • -
    • temperature – controls the randomness of the output (range -from 0 to 1). Default is 0.

    • -
    • system_prompt – a string prompt used to set the context of a -conversation and provide global guidance or rules for the -gpt4-vision so that it can generate responses in the expected way. -If mode set to custom, the parameter will be used.

    • -
    • user_prompt – a string prompt to guide the generation of -gpt4-vision for each samples. It’s “” in default, which means no -prompt provided.

    • -
    • uers_prompt_key – the key name of fields in samples to store -prompts for each sample. It’s used for set different prompts for -different samples. If it’s none, use prompt in parameter “prompt”. -It’s None in default.

    • -
    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated text in the -final datasets and the original text will be removed. It’s True -in default.

    • -
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of -all images. ‘any’: keep this sample if any images meet the -condition. ‘all’: keep this sample only if all images meet the -condition.

    • +
    • mode

      Choose the mode to convert Chinese:

      +

      s2t: Simplified Chinese to Traditional Chinese,

      +

      t2s: Traditional Chinese to Simplified Chinese,

      +

      s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),

      +

      tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,

      +

      s2hk: Simplified Chinese to Traditional Chinese +(Hong Kong variant),

      +

      hk2s: Traditional Chinese (Hong Kong variant) to Simplified +Chinese,

      +

      s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) +with Taiwanese idiom,

      +

      tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese +with Mainland Chinese idiom,

      +

      t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),

      +

      tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,

      +

      hk2t: Traditional Chinese (Hong Kong variant) to Traditional +Chinese,

      +

      t2hk: Traditional Chinese to Traditional Chinese +(Hong Kong variant),

      +

      t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese +Kanji,

      +

      jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese +Characters,

      +

    • args – extra args

    • kwargs – extra args

    @@ -247,21 +216,21 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
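    A minimal usage sketch, assuming the batched sample format with the default 'text' key (the underlying OpenCC conversion backend must be installed):

```
from data_juicer.ops.mapper import ChineseConvertMapper

op = ChineseConvertMapper(mode='t2s')  # Traditional -> Simplified
samples = {'text': ['這是一個繁體中文的例子。']}
converted = op.process_batched(samples)
print(converted['text'][0])  # expected roughly: '这是一个繁体中文的例子。'
```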
    -
    -class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to normalize unicode punctuations to English punctuations in text +

    Mapper to clean copyright comments at the beginning of the text samples.

    -
    -__init__(*args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
    @@ -274,25 +243,26 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove bibliography at the end of documents in Latex -samples.

    +

    Mapper to clean email in text samples.

    -
    -__init__(*args, **kwargs)[source]
    +
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      +
    • pattern – regular expression pattern to search for within text.

    • +
    • repl – replacement string, default is empty string.

    • args – extra args

    • kwargs – extra args

    @@ -301,25 +271,24 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
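    A minimal usage sketch, assuming the batched sample format with the default 'text' key:

```
from data_juicer.ops.mapper import CleanEmailMapper

op = CleanEmailMapper(repl='[EMAIL]')  # substitute matches instead of deleting them
samples = {'text': ['Contact us at support@example.com for details.']}
cleaned = op.process_batched(samples)
print(cleaned['text'][0])  # expected: 'Contact us at [EMAIL] for details.'
```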
    -
    -class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to split text samples to sentences.

    +

    Mapper to clean html code in text samples.

    -
    -__init__(lang: str = 'en', *args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lang – split sentence of text in which language.

    • args – extra args

    • kwargs – extra args

    @@ -328,71 +297,26 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
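    A minimal usage sketch, assuming the batched sample format with the default 'text' key:

```
from data_juicer.ops.mapper import CleanHtmlMapper

op = CleanHtmlMapper()
samples = {'text': ['<p>Hello <b>world</b>!</p>']}
cleaned = op.process_batched(samples)
print(cleaned['text'][0])  # expected roughly: 'Hello world!'
```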
    -
    -class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to cut videos into scene clips.

    -
    -
    -avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    -
    - +

    Mapper to clean ipv4 and ipv6 address in text samples.

    -
    -__init__(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]
    +
    +__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • detector – Algorithm from scenedetect.detectors. Should be one -of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector`].

    • -
    • threshold – Threshold passed to the detector.

    • -
    • min_scene_len – Minimum length of any scene.

    • -
    • show_progress – Whether to show progress from scenedetect.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • -
    -
    -
    -
    - -
    -
    -process_single(sample, context=False)[source]
    -

    For sample level, sample –> sample

    -
    -
    Parameters:
    -

    sample – sample to process

    -
    -
    Returns:
    -

    processed sample

    -
    -
    -
    - -
    - -
    -
    -class data_juicer.ops.mapper.CleanIpMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    -

    Bases: Mapper

    -

    Mapper to clean ipv4 and ipv6 address in text samples.

    -
    -
    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • pattern – regular expression pattern to search for within text.

    • -
    • repl – replacement string, default is empty string.

    • +
    • pattern – regular expression pattern to search for within text.

    • +
    • repl – replacement string, default is empty string.

    • args – extra args

    • kwargs – extra args

    @@ -436,20 +360,18 @@
    -
    -class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove headers at the beginning of documents in Latex +

    Mapper to expand macro definitions in the document body of Latex samples.

    -
    -__init__(drop_no_head: bool = True, *args, **kwargs)[source]
    +
    +__init__(*args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • drop_no_head – whether to drop sample texts without -headers.

    • args – extra args

    • kwargs – extra args

    @@ -458,28 +380,27 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
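    A minimal usage sketch, assuming the batched sample format with the default 'text' key; the LaTeX snippet is illustrative only:

```
from data_juicer.ops.mapper import ExpandMacroMapper

op = ExpandMacroMapper()
tex = '\\newcommand{\\dataset}{ImageNet}\nWe evaluate on \\dataset in Section 3.'
samples = {'text': [tex]}
expanded = op.process_batched(samples)  # uses of \dataset in the body get replaced by its definition
```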
    -
    -class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove table texts from text samples.

    -

    Regular expression is used to remove tables in the range of column -number of tables.

    +

    Mapper to fix unicode errors in text samples.

    -
    -__init__(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]
    +
    +__init__(normalization: str | None = None, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_col – The min number of columns of table to remove.

    • -
    • max_col – The max number of columns of table to remove.

    • +
    • normalization – the specified form of Unicode +normalization mode, which can be one of +[‘NFC’, ‘NFKC’, ‘NFD’, and ‘NFKD’], default ‘NFC’.

    • args – extra args

    • kwargs – extra args

    @@ -488,56 +409,104 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples)[source]
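    A minimal usage sketch, assuming the batched sample format with the default 'text' key:

```
from data_juicer.ops.mapper import FixUnicodeMapper

op = FixUnicodeMapper(normalization='NFKC')
samples = {'text': ['The Mona Lisa doesnâ€™t have eyebrows.']}  # typical mojibake
fixed = op.process_batched(samples)
print(fixed['text'][0])  # expected: "The Mona Lisa doesn't have eyebrows."
```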
    -
    -class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.GenerateQAFromExamplesMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    -

    Remove the watermarks in videos given regions.

    +

    Mapper to generate question and answer pairs from examples. +You should configure an empty dataset in your yaml config file: +``` +generated_dataset_config:

    +
    +

    type: ‘EmptyFormatter’ # use RayEmptyFormatter when enable ray +length: ${The number of generated samples} +feature_keys: ${text key}

    +
    +

    ``` +The number of samples generated is determined by +the length of the empty dataset.

    +
    +
    +DEFAULT_SYSTEM_PROMPT = '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。注意,新生成的【问题】和【回答】需要满足如下要求:\n1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n'
    +
    + +
    +
    +DEFAULT_INPUT_TEMPLATE = '{}'
    +
    + +
    +
    +DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}'
    +
    + +
    +
    +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n'
    +
    + +
    +
    +DEFAULT_OUTPUT_PATTERN = '【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
    +
    +
    -
    -__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +
    +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, seed_file: str = '', example_num: int[int] = 3, similarity_threshold: float = 0.7, system_prompt: str | None = None, input_template: str | None = None, example_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • roi_strings – a given list of regions the watermarks locate. -The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”, -or “[x1, y1, x2, y2]”.

    • -
    • roi_type – the roi string type. When the type is ‘pixel’, (x1, -y1), (x2, y2) are the locations of pixels in the top left corner -and the bottom right corner respectively. If the roi_type is -‘ratio’, the coordinates are normalized by wights and heights.

    • -
    • roi_key – the key name of fields in samples to store roi_strings -for each sample. It’s used for set different rois for different -samples. If it’s none, use rois in parameter “roi_strings”. -It’s None in default.

    • -
    • frame_num – the number of frames to be extracted uniformly from -the video to detect the pixels of watermark.

    • -
    • min_frame_threshold – a coodination is considered as the -location of a watermark pixel when it is that in no less -min_frame_threshold frames.

    • -
    • detection_method – the method to detect the pixels of watermark. -If it is ‘pixel_value’, we consider the distribution of pixel -value in each frame. If it is ‘pixel_diversity’, we will consider -the pixel diversity in different frames. The min_frame_threshold -is useless and frame_num must be greater than 1 in -‘pixel_diversity’ mode.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • hf_model – Hugging Face model ID.

    • +
    • seed_file – Path to the seed file in chatml format.

    • +
    • example_num – The number of selected examples. +Randomly select N examples from “seed_file” and +put them into prompt as QA examples.

    • +
    • similarity_threshold – The similarity score threshold +between the generated samples and the seed examples. +Range from 0 to 1. Samples with similarity score less than +this threshold will be kept.

    • +
    • system_prompt – System prompt for guiding the generation task.

    • +
    • input_template – Template for building the input prompt. It must +include one placeholder ‘{}’, which will be replaced by +example_num formatted examples defined by example_template.

    • +
    • example_template – Template for formatting one QA example. It +must include one placeholder ‘{}’, which will be replaced by one +formatted qa_pair.

    • +
    • qa_pair_template – Template for formatting a single QA pair +within each example. Must include two placeholders ‘{}’ for the +question and answer.

    • +
    • output_pattern – Regular expression pattern to extract questions +and answers from model response.

    • +
    • enable_vllm – Whether to use vllm for inference acceleration.

    • +
    • model_params – Parameters for initializing the model.

    • +
    • sampling_params – Sampling parameters for text generation. +e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

    • +
    • kwargs – Extra keyword arguments.

    -
    -process_single(sample, context=False)[source]
    +
    +build_input(qa_examples)[source]
    +
    + +
    +
    +parse_output(raw_output)[source]
    +
    + +
    +
    +process_single(sample=None, rank=None)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -552,102 +521,308 @@
    -
    -class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.GenerateQAFromTextMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove repeat sentences in text samples.

    +

    Mapper to generate question and answer pairs from text. +Recommended model list: [

    +
    +

    ‘alibaba-pai/pai-llama3-8b-doc2qa’, +‘alibaba-pai/pai-baichuan2-7b-doc2qa’, +‘alibaba-pai/pai-qwen1_5-4b-doc2qa’, +‘alibaba-pai/pai-qwen1_5-7b-doc2qa’, +‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’, +‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’

    +
    +

    ] +These recommended models are all trained with Chinese data +and are suitable for Chinese.

    -
    -__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]
    +
    +__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', *, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • lowercase – Whether to convert sample text to lower case

    • -
    • ignore_special_character – Whether to ignore special -characters when judging repeated sentences. Special characters -are all characters except Chinese characters, letters and -numbers.

    • -
    • min_repeat_sentence_length – Sentences shorter than this -length will not be deduplicated. If ignore_special_character is -set to True, then special characters are not included in this -length.

    • -
    • args – extra args

    • -
    • kwargs – extra args

    • +
    • hf_model – Hugging Face model ID.

    • +
    • output_pattern – Regular expression pattern to extract +questions and answers from model response.

    • +
    • enable_vllm – Whether to use vllm for inference acceleration.

    • +
    • model_params – Parameters for initializing the model.

    • +
    • sampling_params – Sampling parameters for text generation, +e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

    • +
    • kwargs – Extra keyword arguments.

    +

    The default data format parsed by this interface is as follows: +Model Input:

    +
    +

    蒙古国的首都是乌兰巴托(Ulaanbaatar) +冰岛的首都是雷克雅未克(Reykjavik)

    +
    +
    +
    Model Output:

    蒙古国的首都是乌兰巴托(Ulaanbaatar) +冰岛的首都是雷克雅未克(Reykjavik) +Human: 请问蒙古国的首都是哪里? +Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 +Human: 冰岛的首都是哪里呢? +Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 +…

    +
    +
    -
    -process_batched(samples)[source]
    +
    +parse_output(raw_output)[source]
    +
    + +
    +
    +process_batched(samples, rank=None)[source]
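    A minimal usage sketch, assuming the batched sample format with the default 'text' key; the Hugging Face model is downloaded on first use, so treat this as illustrative rather than something to run during a docs build:

```
from data_juicer.ops.mapper import GenerateQAFromTextMapper

op = GenerateQAFromTextMapper(
    hf_model='alibaba-pai/pai-qwen1_5-7b-doc2qa',
    sampling_params={'temperature': 0.9, 'top_p': 0.95},
)
samples = {'text': ['蒙古国的首都是乌兰巴托(Ulaanbaatar)\n冰岛的首都是雷克雅未克(Reykjavik)']}
generated = op.process_batched(samples)  # each QA pair parsed from the model output becomes a sample
```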
    -
    -class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Bases: Mapper

    -

    Generate image by diffusion model

    +

    Mapper to blur images.

    -
    -__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +
    +__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_diffusion – diffusion model name on huggingface to generate -the image.

    • -
    • torch_dtype – the floating point type used to load the diffusion -model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]

    • -
    • revision – The specific model version to use. It can be a -branch name, a tag name, a commit id, or any identifier allowed -by Git.

    • -
    • strength – Indicates extent to transform the reference image. -Must be between 0 and 1. image is used as a starting point and -more noise is added the higher the strength. The number of -denoising steps depends on the amount of noise initially added. -When strength is 1, added noise is maximum and the denoising -process runs for the full number of iterations specified in -num_inference_steps. A value of 1 essentially ignores image.

    • -
    • guidance_scale – A higher guidance scale value encourages the -model to generate images closely linked to the text prompt at the -expense of lower image quality. Guidance scale is enabled when -guidance_scale > 1.

    • -
    • aug_num – The image number to be produced by stable-diffusion -model.

    • -
    • keep_candidate_mode

      retain strategy for the generated -$caption_num$ candidates.

      -

      ’random_any’: Retain the random one from generated captions

      -
      -
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      -
      -
      -

      ’all’: Retain all generated captions by concatenation

      -

    • +
    • p – Probability of the image being blurred.

    • +
    • blur_type – Type of blur kernel, including +[‘mean’, ‘box’, ‘gaussian’].

    • +
    • radius – Radius of blur kernel.

    • +
    • args – extra args

    • +
    • kwargs – extra args
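    A minimal usage sketch, assuming the default 'images' key holding image file paths ('photo.jpg' is a placeholder):

```
from data_juicer.ops.mapper import ImageBlurMapper

op = ImageBlurMapper(p=1.0, blur_type='gaussian', radius=2)  # blur every image
sample = {'images': ['photo.jpg']}
blurred = op.process_single(sample)  # 'images' should now reference the blurred copies
```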

    -
    -

    Note

    -

    This is a batched_OP, whose input and output type are -both list. Suppose there are $N$ list of input samples, whose batch -size is $b$, and denote caption_num as $M$. -The number of total samples after generation is $2Nb$ when -keep_original_sample is True and $Nb$ when keep_original_sample is -False. For ‘random_any’ and ‘similar_one_simhash’ mode, -it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True -and $MNb$ when keep_original_sample is False.

    -
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    Parameters:
    -
      -
    • caption_key – the key name of fields in samples to store captions -for each images. It can be a string if there is only one image in -each sample. Otherwise, it should be a list. If it’s none, -ImageDiffusionMapper will produce captions for each images.

    • +

      sample – sample to process

      +
      +
      Returns:
      +

      processed sample

      +
      +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper to generate samples whose texts are generated based on +gpt-4-vision and the image.

    +
    +
    +__init__(mode: str = 'description', api_key: str = '', max_token: int = 500, temperature: float[float] = 1.0, system_prompt: str = '', user_prompt: str = '', user_prompt_key: str | None = None, keep_original_sample: bool = True, any_or_all: str = 'any', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • mode – mode of text generated from images, can be one of +[‘resoning’, ‘description’, ‘conversation’, ‘custom’]

    • +
    • api_key – the API key to authenticate the request.

    • +
    • max_token – the maximum number of tokens to generate. +Default is 500.

    • +
    • temperature – controls the randomness of the output (range +from 0 to 1). Default is 0.

    • +
    • system_prompt – a string prompt used to set the context of a +conversation and provide global guidance or rules for the +gpt4-vision so that it can generate responses in the expected way. +If mode set to custom, the parameter will be used.

    • +
    • user_prompt – a string prompt to guide the generation of +gpt4-vision for each samples. It’s “” in default, which means no +prompt provided.

    • +
    • user_prompt_key – the key name of the field in samples that stores +prompts for each sample. It’s used to set different prompts for +different samples. If it’s None, the prompt in the parameter “user_prompt” is used. +It’s None in default.

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated text in the +final datasets and the original text will be removed. It’s True +in default.

    • +
    • any_or_all – keep this sample with ‘any’ or ‘all’ strategy of +all images. ‘any’: keep this sample if any images meet the +condition. ‘all’: keep this sample only if all images meet the +condition.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_batched(samples)[source]
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper to generate samples whose captions are generated based on +another model and the figure.

    +
    +
    +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • hf_img2seq – model name on huggingface to generate caption

    • +
    • caption_num – how many candidate captions to generate +for each image

    • +
    • keep_candidate_mode

      retain strategy for the generated +$caption_num$ candidates.

      +

      ’random_any’: Retain the random one from generated captions

      +
      +
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      +
      +
      +

      ’all’: Retain all generated captions by concatenation

      +

    • +
    +
    +
    +
    +

    Note

    +

    This is a batched_OP, whose input and output type are +both list. Suppose there are $N$ lists of input samples, each with batch +size $b$, and denote caption_num as $M$. +For ‘random_any’ and ‘similar_one_simhash’ modes, the total number of samples after generation is $2Nb$ when +keep_original_sample is True and $Nb$ when keep_original_sample is +False. For ‘all’ mode, +it’s $(1+M)Nb$ when keep_original_sample is True +and $MNb$ when keep_original_sample is False.

    +
    +
    +
    Parameters:
    +
      +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated captions in the +final datasets and the original captions will be removed. It’s True +in default.

    • +
    • prompt – a string prompt to guide the generation of blip2 model +for all samples globally. It’s None in default, which means no +prompt provided.

    • +
    • prompt_key – the key name of fields in samples to store prompts +for each sample. It’s used for set different prompts for different +samples. If it’s none, use prompt in parameter “prompt”. It’s None +in default.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_batched(samples, rank=None)[source]
    +
    +

    Note

    +

    This is a batched_OP, whose input and output type are +both list. Suppose there are $N$ input sample list with batch +size as $b$, and denote caption_num as $M$. +the number of total samples after generation is $2Nb$ +for ‘random_any’ and ‘similar_one’ mode, +and $(1+M)Nb$ for ‘all’ mode.

    +
    +
    +
    Parameters:
    +

    samples

    +
    +
    Returns:
    +

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.ImageDiffusionMapper(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Generate image by diffusion model

    +
    +
    +__init__(hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', trust_remote_code: bool = False, torch_dtype: str = 'fp32', revision: str = 'main', strength: float[float] = 0.8, guidance_scale: float = 7.5, aug_num: int[int] = 1, keep_original_sample: bool = True, caption_key: str | None = None, hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • hf_diffusion – diffusion model name on huggingface to generate +the image.

    • +
    • torch_dtype – the floating point type used to load the diffusion +model. Can be one of [‘fp32’, ‘fp16’, ‘bf16’]

    • +
    • revision – The specific model version to use. It can be a +branch name, a tag name, a commit id, or any identifier allowed +by Git.

    • +
    • strength – Indicates extent to transform the reference image. +Must be between 0 and 1. image is used as a starting point and +more noise is added the higher the strength. The number of +denoising steps depends on the amount of noise initially added. +When strength is 1, added noise is maximum and the denoising +process runs for the full number of iterations specified in +num_inference_steps. A value of 1 essentially ignores image.

    • +
    • guidance_scale – A higher guidance scale value encourages the +model to generate images closely linked to the text prompt at the +expense of lower image quality. Guidance scale is enabled when +guidance_scale > 1.

    • +
    • aug_num – The image number to be produced by stable-diffusion +model.

    • +
    • keep_candidate_mode

      retain strategy for the generated +$caption_num$ candidates.

      +

      ’random_any’: Retain the random one from generated captions

      +
      +
      ’similar_one_simhash’: Retain the generated one that is most

      similar to the original caption

      +
      +
      +

      ’all’: Retain all generated captions by concatenation

      +

    • +
    +
    +
    +
    +

    Note

    +

    This is a batched_OP, whose input and output type are +both list. Suppose there are $N$ list of input samples, whose batch +size is $b$, and denote caption_num as $M$. +The number of total samples after generation is $2Nb$ when +keep_original_sample is True and $Nb$ when keep_original_sample is +False. For ‘random_any’ and ‘similar_one_simhash’ mode, +it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True +and $MNb$ when keep_original_sample is False.

    +
    +
    +
    Parameters:
    +
      +
    • caption_key – the key name of fields in samples to store captions +for each images. It can be a string if there is only one image in +each sample. Otherwise, it should be a list. If it’s none, +ImageDiffusionMapper will produce captions for each images.

    • hf_img2seq – model name on huggingface to generate caption if caption_key is None.

    @@ -718,22 +893,21 @@
    -
    -class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]

    Bases: Mapper

    -

    Simple wrapper for FFmpeg video filters.

    +

    Mapper to generate image tags.

    -
    -__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    -

    Initialization method.

    +
    +__init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    +

    Initialization method. +:param tag_field_name: the field name to store the tags. It’s

    +
    +

    “__dj__image_tags__” in default.

    +
    Parameters:
      -
    • filter_name – ffmpeg video filter name.

    • -
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • -
    • global_args – list-arguments passed to ffmpeg command-line.

    • -
    • capture_stderr – whether to capture stderr.

    • -
    • overwrite_output – whether to overwrite output file.

    • args – extra args

    • kwargs – extra args

    @@ -742,8 +916,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -757,43 +931,61 @@
    -
    -
    -class data_juicer.ops.mapper.ChineseConvertMapper(mode: str = 's2t', *args, **kwargs)[source]
    -

    Bases: Mapper

    -

    Mapper to convert Chinese between Traditional Chinese, Simplified Chinese -and Japanese Kanji.

    -
    -
    -__init__(mode: str = 's2t', *args, **kwargs)[source]
    -

    Initialization method.

    -
    -
    Parameters:
    -
      -
    • mode

      Choose the mode to convert Chinese:

      -

      s2t: Simplified Chinese to Traditional Chinese,

      -

      t2s: Traditional Chinese to Simplified Chinese,

      -

      s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard),

      -

      tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese,

      -

      s2hk: Simplified Chinese to Traditional Chinese -(Hong Kong variant),

      -

      hk2s: Traditional Chinese (Hong Kong variant) to Simplified -Chinese,

      -

      s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) -with Taiwanese idiom,

      -

      tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese -with Mainland Chinese idiom,

      -

      t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard),

      -

      tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese,

      -

      hk2t: Traditional Chinese (Hong Kong variant) to Traditional -Chinese,

      -

      t2hk: Traditional Chinese to Traditional Chinese -(Hong Kong variant),

      -

      t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese -Kanji,

      -

      jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese -Characters,

      -

    • +
      +
      +class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
      +

      Bases: Mapper

      +

      Mapper to simply augment samples in English based on nlpaug library.

      +
      +
      +__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
      +

    Initialization method. All augmentation methods use their default parameters. +We recommend enabling only 1-3 augmentation methods at a +time. Otherwise, the semantics of samples might be changed +significantly.

      +
      +
      Parameters:
      +
        +
      • sequential – whether combine all augmentation methods to a +sequence. If it’s True, a sample will be augmented by all opened +augmentation methods sequentially. If it’s False, each opened +augmentation method would generate its augmented samples +independently.

      • +
      • aug_num – number of augmented samples to be generated. If +sequential is True, there will be total aug_num augmented samples +generated. If it’s False, there will be (aug_num * +#opened_aug_method) augmented samples generated.

      • +
      • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only generated texts in the final +datasets and the original texts will be removed. It’s True in +default.

      • +
      • delete_random_word – whether to open the augmentation method of +deleting random words from the original texts. e.g. “I love LLM” +–> “I LLM”

      • +
      • swap_random_word – whether to open the augmentation method of +swapping random contiguous words in the original texts. e.g. “I +love LLM” –> “Love I LLM”

      • +
      • spelling_error_word – whether to open the augmentation method of +simulating the spelling error for words in the original texts. e.g. +“I love LLM” –> “Ai love LLM”

      • +
      • split_random_word – whether to open the augmentation method of +splitting words randomly with whitespaces in the original texts. +e.g. “I love LLM” –> “I love LL M”

      • +
      • keyboard_error_char – whether to open the augmentation method of +simulating the keyboard error for characters in the original texts. +e.g. “I love LLM” –> “I ;ov4 LLM”

      • +
      • ocr_error_char – whether to open the augmentation method of +simulating the OCR error for characters in the original texts. +e.g. “I love LLM” –> “I 10ve LLM”

      • +
      • delete_random_char – whether to open the augmentation method of +deleting random characters from the original texts. e.g. “I love +LLM” –> “I oe LLM”

      • +
      • swap_random_char – whether to open the augmentation method of +swapping random contiguous characters in the original texts. +e.g. “I love LLM” –> “I ovle LLM”

      • +
      • insert_random_char – whether to open the augmentation method of +inserting random characters into the original texts. e.g. “I love +LLM” –> “I ^lKove LLM”

      • args – extra args

      • kwargs – extra args

      @@ -802,8 +994,8 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
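    A minimal usage sketch, assuming the batched sample format with the default 'text' key (requires the nlpaug library):

```
from data_juicer.ops.mapper import NlpaugEnMapper

op = NlpaugEnMapper(
    aug_num=2,
    keep_original_sample=True,
    delete_random_word=True,
    swap_random_char=True,
)
samples = {'text': ['I love LLM']}
augmented = op.process_batched(samples)
# 2 opened methods x aug_num=2 -> 4 augmented texts, plus the original
print(len(augmented['text']))  # expected: 5
```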
      @@ -868,88 +1060,71 @@
      -
      -class data_juicer.ops.mapper.OptimizeInstructionMapper(hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', trust_remote_code: bool = False, system_prompt: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.OptimizeQAMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to optimize instruction. -Recommended model list: [

      -
      -

      alibaba-pai/Qwen2-1.5B-Instruct-Refine -alibaba-pai/Qwen2-7B-Instruct-Refine

      -
      -

      ]

      -
      -
      -__init__(hf_model: str = 'alibaba-pai/Qwen2-7B-Instruct-Refine', trust_remote_code: bool = False, system_prompt: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

      Initialization method. -:param hf_model: Hugginface model id. -:param trust_remote_code: passed to transformers -:param system_prompt: System prompt for optimize samples. -:param enable_vllm: Whether to use vllm for inference acceleration. -:param tensor_parallel_size: It is only valid when enable_vllm is True.

      -
      -

      The number of GPUs to use for distributed execution with tensor -parallelism.

      -
      -
      -
      Parameters:
      -
        -
      • max_model_len – It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      • -
      • max_num_seqs – It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      • -
      • sampling_params – Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      +

      Mapper to optimize question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '请优化输入的问答对,使【问题】和【回答】都更加详细、准确。必须按照以下标记格式,直接输出优化后的问答对:\n【问题】\n优化后的问题\n【回答】\n优化后的回答'
      +
      -
      -
      -process_single(sample=None, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +
      +DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}'
      +
      -
      +
      +
      +DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}'
      +
      + +
      +
      +DEFAULT_OUTPUT_PATTERN = '.*?【问题】\\s*(.*?)\\s*【回答】\\s*(.*)'
      +
      -
      -
      -class data_juicer.ops.mapper.ImageBlurMapper(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to blur images.

      -
      -__init__(p: float = 0.2, blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
      +
      +__init__(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • p – Probability of the image being blured.

      • -
      • blur_type – Type of blur kernel, including -[‘mean’, ‘box’, ‘gaussian’].

      • -
      • radius – Radius of blur kernel.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • +
      • hf_model – Hugging Face model ID.

      • +
      • system_prompt – System prompt for guiding the optimization task.

      • +
      • input_template – Template for building the input for the model. +Please make sure the template contains one placeholder ‘{}’, which +corresponds to the question and answer pair generated by +param qa_pair_template.

      • +
      • qa_pair_template – Template for formatting the question and +answer pair. Please make sure the template contains two +‘{}’ to format question and answer.

      • +
      • output_pattern – Regular expression pattern to extract question +and answer from model response.

      • +
      • enable_vllm – Whether to use VLLM for inference acceleration.

      • +
      • model_params – Parameters for initializing the model.

      • +
      • sampling_params – Sampling parameters for text generation (e.g., +{‘temperature’: 0.9, ‘top_p’: 0.95}).

      • +
      • kwargs – Extra keyword arguments.
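    A minimal usage sketch; it assumes the question sits under the default 'query' key and the answer under 'response', and that downloading and running the Hugging Face model is acceptable (illustrative only):

```
from data_juicer.ops.mapper import OptimizeQAMapper

op = OptimizeQAMapper(
    hf_model='Qwen/Qwen2.5-7B-Instruct',
    sampling_params={'temperature': 0.9, 'top_p': 0.95},
)
sample = {'query': '地球有多大?', 'response': '地球很大。'}
optimized = op.process_single(sample)  # both query and response rewritten to be more detailed
```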

      -
      -process_single(sample, context=False)[source]
      +
      +build_input(sample)[source]
      +
      + +
      +
      +parse_output(raw_output)[source]
      +
      + +
      +
      +process_single(sample=None, rank=None)[source]

      For sample level, sample –> sample

      Parameters:
      @@ -964,47 +1139,52 @@
      -
      -class data_juicer.ops.mapper.CleanCopyrightMapper(*args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to clean copyright comments at the beginning of the text -samples.

      +
      +class data_juicer.ops.mapper.OptimizeQueryMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
      +

      Bases: OptimizeQAMapper

      +

      Mapper to optimize query in question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。'
      +
      +
      -
      -__init__(*args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      +
      +parse_output(raw_output)[source]
      +
      +
      +
      +
      +class data_juicer.ops.mapper.OptimizeResponseMapper(hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', *, system_prompt: str | None = None, input_template: str | None = None, qa_pair_template: str | None = None, output_pattern: str | None = None, enable_vllm: bool = False, model_params: Dict | None = None, sampling_params: Dict | None = None, **kwargs)[source]
      +

      Bases: OptimizeQAMapper

      +

      Mapper to optimize response in question-answer pairs.

      +
      +
      +DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。'
      +
      +
      -
      -process_batched(samples)[source]
      +
      +parse_output(raw_output)[source]
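Since the two subclasses above only override the system prompt and parse_output, using them mirrors OptimizeQAMapper. The construction sketch below assumes the listed Hugging Face model is available locally; the commented sample layout (query/response fields) is an assumption, not something stated on this page.

from data_juicer.ops.mapper import OptimizeQueryMapper

# Instantiating the op loads the specified Hugging Face model.
op = OptimizeQueryMapper(
    hf_model='Qwen/Qwen2.5-7B-Instruct',
    enable_vllm=False,
    sampling_params={'temperature': 0.9, 'top_p': 0.95},
)
# sample = {'query': '...', 'response': '...'}  # assumed field names
# optimized = op.process_single(sample)         # rewrites only the query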
      -
      -class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.PunctuationNormalizationMapper(*args, **kwargs)[source]

      Bases: Mapper

      -

Mapper to remove non-Chinese characters in text samples.

      +

Mapper to normalize Unicode punctuation to English punctuation in text samples.

      -
      -__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]
      +
      +__init__(*args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • keep_alphabet – whether to keep alphabet

      • -
      • keep_number – whether to keep number

      • -
      • keep_punc – whether to keep punctuation

      • args – extra args

      • kwargs – extra args

      @@ -1013,28 +1193,25 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
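A minimal usage sketch for this op follows, assuming the default ‘text’ field and the batched dict-of-lists sample layout used by data_juicer ops.

from data_juicer.ops.mapper import PunctuationNormalizationMapper

op = PunctuationNormalizationMapper()
samples = {'text': ['“你好”，world…']}
samples = op.process_batched(samples)
# Unicode quotes, commas and ellipses are mapped to their ASCII counterparts.
print(samples['text'][0])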
      -
      -class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveBibliographyMapper(*args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to split video by key frame.

      +

Mapper to remove the bibliography at the end of documents in LaTeX samples.

      -
      -__init__(keep_original_sample: bool = True, *args, **kwargs)[source]
      +
      +__init__(*args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only split sample in the -final datasets and the original sample will be removed. It’s True -in default.

      • args – extra args

      • kwargs – extra args

      @@ -1043,31 +1220,28 @@
      -
      -get_split_key_frame(video_key, container)[source]
      -
      - -
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
      -
      -class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to clean specific chars in text samples.

      +

      Mapper to remove comments in different kinds of documents.

      +

Only ‘tex’ is supported for now.

      -
      -__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]
      +
      +__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • chars_to_remove – a list or a string including all -characters that need to be removed from text.

      • +
      • doc_type – Type of document to remove comments.

      • +
      • inline – Whether to remove inline comments.

      • +
      • multiline – Whether to remove multiline comments.

      • args – extra args

      • kwargs – extra args

      @@ -1076,77 +1250,57 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
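For illustration, a hedged sketch of removing TeX comments with this op; the exact regular expressions applied are internal to the op and not shown on this page.

from data_juicer.ops.mapper import RemoveCommentsMapper

op = RemoveCommentsMapper(doc_type='tex', inline=True, multiline=True)
samples = {'text': ['% full-line comment\n\\section{Intro} % trailing comment\nBody text.\n']}
samples = op.process_batched(samples)
# Full-line and trailing '%' comments should be stripped from the TeX source.
print(samples['text'][0])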
      -
      -class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveHeaderMapper(drop_no_head: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to resize videos by aspect ratio. -AspectRatio = W / H.

      -
      -
      -STRATEGY = ['decrease', 'increase']
      -
      - +

Mapper to remove headers at the beginning of documents in LaTeX samples.

      -
      -__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]
      +
      +__init__(drop_no_head: bool = True, *args, **kwargs)[source]

      Initialization method.

      -
      Parameters:
      -
        -
      • min_ratio – The minimum aspect ratio to enforce videos with -an aspect ratio below min_ratio will be resized to match -this minimum ratio. The ratio should be provided as a string -in the format “9:21” or “9/21”.

      • -
      • max_ratio – The maximum aspect ratio to enforce videos with -an aspect ratio above max_ratio will be resized to match -this maximum ratio. The ratio should be provided as a string -in the format “21:9” or “21/9”.

      • -
      • strategy – The resizing strategy to apply when adjusting the -video dimensions. It can be either ‘decrease’ to reduce the -dimension or ‘increase’ to enlarge it. Accepted values are -[‘decrease’, ‘increase’].

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      - -
      -
      -process_single(sample)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      +
      Parameters:
      +
        +
      • drop_no_head – whether to drop sample texts without +headers.

      • +
      • args – extra args

      • +
      • kwargs – extra args

      • +
      +
      +
      +process_batched(samples)[source]
      +
      +
      -
      -class data_juicer.ops.mapper.CleanHtmlMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to clean html code in text samples.

      +

      Mapper to remove long words within a specific range.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
• min_len – the minimum word length in this op; words will be filtered out if their length is below this parameter.

      • +
• max_len – the maximum word length in this op; words will be filtered out if their length exceeds this parameter.

      • args – extra args

      • kwargs – extra args

      @@ -1155,27 +1309,32 @@
      -
      -process_batched(samples)[source]
      +
      +should_keep_long_word(word)[source]
      +
      + +
      +
      +process_batched(samples)[source]
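A small sketch of the filtering rule described above, assuming the default ‘text’ field:

from data_juicer.ops.mapper import RemoveLongWordsMapper

op = RemoveLongWordsMapper(min_len=2, max_len=12)
print(op.should_keep_long_word('supercalifragilistic'))  # False: longer than max_len
samples = {'text': ['a sentence with one extraordinarilylongtoken inside']}
samples = op.process_batched(samples)
# Words shorter than min_len or longer than max_len are dropped from the text.
print(samples['text'][0])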
      -
      -class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to normalize different kinds of whitespaces to whitespace ‘ ‘ (0x20) -in text samples.

      -

      Different kinds of whitespaces can be found here: -https://en.wikipedia.org/wiki/Whitespace_character

      +

Mapper to remove non-Chinese characters in text samples.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(keep_alphabet: bool = True, keep_number: bool = True, keep_punc: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
      • keep_alphabet – whether to keep alphabet

      • +
      • keep_number – whether to keep number

      • +
      • keep_punc – whether to keep punctuation

      • args – extra args

      • kwargs – extra args

      @@ -1184,39 +1343,33 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
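A short sketch of the keep_* switches, assuming the default ‘text’ field:

from data_juicer.ops.mapper import RemoveNonChineseCharacterlMapper

op = RemoveNonChineseCharacterlMapper(keep_alphabet=True, keep_number=False, keep_punc=False)
samples = {'text': ['数据Juicer 2024 版本!']}
samples = op.process_batched(samples)
# Digits and punctuation are removed; Chinese characters and Latin letters are kept.
print(samples['text'][0])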
      -
      -class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveRepeatSentencesMapper(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

      Bases: Mapper

      -

Mapper to generate video tags from frames extracted from the video.

      +

Mapper to remove repeated sentences in text samples.

      -
      -__init__(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]
      +
      +__init__(lowercase: bool = False, ignore_special_character: bool = True, min_repeat_sentence_length: int = 2, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • frame_sampling_method – sampling method of extracting frame -images from the videos. Should be one of -[“all_keyframes”, “uniform”]. -The former one extracts all key frames (the number of which depends -on the duration of the video) and the latter one extract specified -number of frames uniformly from the video. -Default: “all_keyframes”.

      • -
      • frame_num – the number of frames to be extracted uniformly from -the video. Only works when frame_sampling_method is “uniform”. If -it’s 1, only the middle frame will be extracted. If it’s 2, only -the first and the last frames will be extracted. If it’s larger -than 2, in addition to the first and the last frames, other frames -will be extracted uniformly within the video duration.

      • -
      • tag_field_name – the field name to store the tags. It’s -“__dj__video_frame_tags__” in default.

      • +
      • lowercase – Whether to convert sample text to lower case

      • +
      • ignore_special_character – Whether to ignore special +characters when judging repeated sentences. Special characters +are all characters except Chinese characters, letters and +numbers.

      • +
      • min_repeat_sentence_length – Sentences shorter than this +length will not be deduplicated. If ignore_special_character is +set to True, then special characters are not included in this +length.

      • args – extra args

      • kwargs – extra args

      @@ -1225,37 +1378,26 @@
      -
      -process_single(sample, rank=None, context=False)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +process_batched(samples)[source]
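A minimal sketch of deduplicating repeated sentences; the sentence-splitting details are internal to the op and assumed here.

from data_juicer.ops.mapper import RemoveRepeatSentencesMapper

op = RemoveRepeatSentencesMapper(lowercase=False, min_repeat_sentence_length=2)
samples = {'text': ['今天天气很好。今天天气很好。我们去公园散步吧。']}
samples = op.process_batched(samples)
# The duplicated sentence should be kept only once.
print(samples['text'][0])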
      +
      -
      -class data_juicer.ops.mapper.RemoveCommentsMapper(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveSpecificCharsMapper(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to remove comments in different kinds of documents.

      -

      Only support ‘tex’ for now.

      +

      Mapper to clean specific chars in text samples.

      -
      -__init__(doc_type: str | List[str] = 'tex', inline: bool = True, multiline: bool = True, *args, **kwargs)[source]
      +
      +__init__(chars_to_remove: str | List[str] = '◆●■►▼▲▴∆▻▷❖♡□', *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • doc_type – Type of document to remove comments.

      • -
      • inline – Whether to remove inline comments.

      • -
      • multiline – Whether to remove multiline comments.

      • +
      • chars_to_remove – a list or a string including all +characters that need to be removed from text.

      • args – extra args

      • kwargs – extra args

      @@ -1264,25 +1406,28 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
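A short usage sketch with a custom character set, assuming the default ‘text’ field:

from data_juicer.ops.mapper import RemoveSpecificCharsMapper

op = RemoveSpecificCharsMapper(chars_to_remove='◆●■')
samples = {'text': ['◆Data-Juicer● 快速上手■']}
samples = op.process_batched(samples)
# Every occurrence of the listed characters is deleted from the text.
print(samples['text'][0])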
      -
      -class data_juicer.ops.mapper.ExpandMacroMapper(*args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveTableTextMapper(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to expand macro definitions in the document body of Latex -samples.

      +

      Mapper to remove table texts from text samples.

      +

A regular expression is used to remove tables whose column count falls within the specified range.

      -
      -__init__(*args, **kwargs)[source]
      +
      +__init__(min_col: int[int] = 2, max_col: int[int] = 20, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        +
      • min_col – The min number of columns of table to remove.

      • +
      • max_col – The max number of columns of table to remove.

      • args – extra args

      • kwargs – extra args

      @@ -1291,145 +1436,62 @@
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples)[source]
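A construction sketch; the column range only bounds which table-like blocks are matched, and the exact table pattern is internal to the op.

from data_juicer.ops.mapper import RemoveTableTextMapper

# Remove table-like text whose column count is between 3 and 10.
op = RemoveTableTextMapper(min_col=3, max_col=10)
# samples = {'text': ['...document text containing a table...']}
# samples = op.process_batched(samples)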
      -
      -class data_juicer.ops.mapper.ExtractQAMapper(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code: bool = False, pattern: str | None = None, qa_format: str = 'chatml', enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]

      Bases: Mapper

      -

Mapper to extract question and answer pairs from text samples.
Recommended model list: [‘alibaba-pai/pai-llama3-8b-doc2qa’, ‘alibaba-pai/pai-baichuan2-7b-doc2qa’, ‘alibaba-pai/pai-qwen1_5-4b-doc2qa’, ‘alibaba-pai/pai-qwen1_5-7b-doc2qa’, ‘alibaba-pai/pai-qwen1_5-1b8-doc2qa’, ‘alibaba-pai/pai-qwen1_5-0b5-doc2qa’].
These recommended models are all trained with Chinese data and are suitable for Chinese text.

      +

      Mapper to remove words with incorrect substrings.

      -
      -__init__(hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', trust_remote_code: bool = False, pattern: str | None = None, qa_format: str = 'chatml', enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

Initialization method.
:param hf_model: Hugging Face model ID.
:param trust_remote_code: passed to transformers.
:param pattern: regular expression pattern to search for within text.
:param qa_format: output format of the question and answer pair.
:param enable_vllm: whether to use vLLM for inference acceleration.
:param tensor_parallel_size: only valid when enable_vllm is True. The number of GPUs to use for distributed execution with tensor parallelism.

      -
      +
      +__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +

      Initialization method.

      Parameters:
        -
      • max_model_len – It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      • -
      • max_num_seqs – It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      • -
      • sampling_params – Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      • +
• lang – the language of the samples

      • +
      • tokenization – whether to use model to tokenize documents

      • +
      • substrings – The incorrect substrings in words.

      • args – extra args

      • kwargs – extra args

      -

The default data format parsed by this interface is as follows:

Model Input:
蒙古国的首都是乌兰巴托(Ulaanbaatar)
冰岛的首都是雷克雅未克(Reykjavik)

Model Output:
蒙古国的首都是乌兰巴托(Ulaanbaatar)
冰岛的首都是雷克雅未克(Reykjavik)
Human: 请问蒙古国的首都是哪里?
Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。
Human: 冰岛的首都是哪里呢?
Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。
…

      -
      -
      -
      -process_single(sample, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      +
      +should_keep_word_with_incorrect_substrings(word, substrings)[source]
      +
      + +
      +
      +process_batched(samples)[source]
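A minimal sketch with explicit substrings (the default substring list is not restated here), assuming the default ‘text’ field:

from data_juicer.ops.mapper import RemoveWordsWithIncorrectSubstringsMapper

op = RemoveWordsWithIncorrectSubstringsMapper(lang='en', tokenization=False, substrings=['http', '.com'])
print(op.should_keep_word_with_incorrect_substrings('example.com', ['http', '.com']))  # False
samples = {'text': ['visit http://example.com for details']}
samples = op.process_batched(samples)
# Words containing any of the given substrings are dropped.
print(samples['text'][0])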
      +
      -
      -class data_juicer.ops.mapper.ImageCaptioningMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to generate samples whose captions are generated based on -another model and the figure.

      +

      Mapper to replace all content in the text that matches +a specific regular expression pattern with a designated +replacement string.

      -
      -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, *args, **kwargs)[source]
      +
      +__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • hf_img2seq – model name on huggingface to generate caption

      • -
      • caption_num – how many candidate captions to generate -for each image

      • -
      • keep_candidate_mode

        retain strategy for the generated -$caption_num$ candidates.

        -

        ’random_any’: Retain the random one from generated captions

        -
        -
        ’similar_one_simhash’: Retain the generated one that is most

        similar to the original caption

        -
        -
        -

        ’all’: Retain all generated captions by concatenation

        -

      • -
      -
      -
      -
      -

      Note

      -

      This is a batched_OP, whose input and output type are -both list. Suppose there are $N$ list of input samples, whose batch -size is $b$, and denote caption_num as $M$. -The number of total samples after generation is $2Nb$ when -keep_original_sample is True and $Nb$ when keep_original_sample is -False. For ‘random_any’ and ‘similar_one_simhash’ mode, -it’s $(1+M)Nb$ for ‘all’ mode when keep_original_sample is True -and $MNb$ when keep_original_sample is False.

      -
      -
      -
      Parameters:
      -
        -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated captions in the -final datasets and the original captions will be removed. It’s True -in default.

      • -
      • prompt – a string prompt to guide the generation of blip2 model -for all samples globally. It’s None in default, which means no -prompt provided.

      • -
      • prompt_key – the key name of fields in samples to store prompts -for each sample. It’s used for set different prompts for different -samples. If it’s none, use prompt in parameter “prompt”. It’s None -in default.

      • +
      • pattern – regular expression pattern(s) to search for within text

      • +
      • repl – replacement string(s), default is empty string

      • args – extra args

      • kwargs – extra args

      @@ -1438,44 +1500,56 @@
      -
      -process_batched(samples, rank=None)[source]
      -
      -

      Note

      -

      This is a batched_OP, whose input and output type are -both list. Suppose there are $N$ input sample list with batch -size as $b$, and denote caption_num as $M$. -the number of total samples after generation is $2Nb$ -for ‘random_any’ and ‘similar_one’ mode, -and $(1+M)Nb$ for ‘all’ mode.

      -
      +
      +process_batched(samples)[source]
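A short sketch of a regex-based replacement, assuming the default ‘text’ field:

from data_juicer.ops.mapper import ReplaceContentMapper

# Mask phone-number-like patterns with '***'.
op = ReplaceContentMapper(pattern=r'\d{3}-\d{4}-\d{4}', repl='***')
samples = {'text': ['联系电话:010-1234-5678']}
samples = op.process_batched(samples)
print(samples['text'][0])  # the matched span is replaced by '***'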
      +
      + +
      + +
      +
      +class data_juicer.ops.mapper.SentenceSplitMapper(lang: str = 'en', *args, **kwargs)[source]
      +

      Bases: Mapper

      +

      Mapper to split text samples to sentences.

      +
      +
      +__init__(lang: str = 'en', *args, **kwargs)[source]
      +

      Initialization method.

      Parameters:
      -

      samples

      -
      -
      Returns:
      -

      +
        +
• lang – the language in which to split the text into sentences.

      • +
      • args – extra args

      • +
      • kwargs – extra args

      • +
      +
      +
      +process_batched(samples)[source]
      +
      +
      -
      -class data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.VideoCaptioningFromAudioMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

      Bases: Mapper

      -

      Mapper to remove words with incorrect substrings.

      +

Mapper to caption a video according to its audio streams, based on the Qwen-Audio model.

      -
      -__init__(lang: str = 'en', tokenization: bool = False, substrings: List[str] | None = None, *args, **kwargs)[source]
      +
      +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • lang – sample in which language

      • -
      • tokenization – whether to use model to tokenize documents

      • -
      • substrings – The incorrect substrings in words.

      • +
      • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only captioned sample in the +final datasets and the original sample will be removed. It’s True +in default.

      • args – extra args

      • kwargs – extra args

      @@ -1484,32 +1558,27 @@
      -
      -should_keep_word_with_incorrect_substrings(word, substrings)[source]
      -
      - -
      -
      -process_batched(samples)[source]
      +
      +process_batched(samples, rank=None)[source]
      -
      -class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
      +
      +class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

      Bases: Mapper

      Mapper to generate samples whose captions are generated based on -a video-to-text model and sampled video frame.

      +an image-to-text model and sampled video frames. Captions from different +frames will be concatenated to a single string.

      -
      -__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
      +
      +__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

      Initialization method.

      Parameters:
        -
      • hf_video_blip – video-blip model name on huggingface -to generate caption

      • +
      • hf_img2seq – model name on huggingface to generate caption

      • caption_num – how many candidate captions to generate for each video

      • keep_candidate_mode

        retain strategy for the generated @@ -1542,7 +1611,7 @@ it’s set to False, there will be only generated captions in the final datasets and the original captions will be removed. It’s True in default.

      • -
      • prompt – a string prompt to guide the generation of video-blip +

      • prompt – a string prompt to guide the generation of image-to-text model for all samples globally. It’s None in default, which means no prompt provided.

      • prompt_key – the key name of fields in samples to store prompts @@ -1572,8 +1641,8 @@

      -
      -process_batched(samples, rank=None, context=False)[source]
      +
      +process_batched(samples, rank=None, context=False)[source]
      Parameters:

      samples

      @@ -1583,316 +1652,67 @@
      -

      Note

      -

      This is a batched_OP, whose the input and output type are -both list. Suppose there are $N$ input sample list with batch -size as $b$, and denote caption_num as $M$. -the number of total samples after generation is $2Nb$ -for ‘random_any’ and ‘similar_one’ mode, -and $(1+M)Nb$ for ‘all’ mode.

      -
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to generate video captions by summarizing several kinds of generated -texts (captions from video/audio/frames, tags from audio/frames, …)

      -
      -
      -__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
      • hf_summarizer – the summarizer model used to summarize texts -generated by other methods.

      • -
      • consider_video_caption_from_video – whether to consider the video -caption generated from video directly in the summarization process. -Default: True.

      • -
      • consider_video_caption_from_audio – whether to consider the video -caption generated from audio streams in the video in the -summarization process. Default: True.

      • -
      • consider_video_caption_from_frames – whether to consider the -video caption generated from sampled frames from the video in the -summarization process. Default: True.

      • -
      • consider_video_tags_from_audio – whether to consider the video -tags generated from audio streams in the video in the summarization -process. Default: True.

      • -
      • consider_video_tags_from_frames – whether to consider the video -tags generated from sampled frames from the video in the -summarization process. Default: True.

      • -
      • vid_cap_from_vid_args – the arg dict for video captioning from -video directly with keys are the arg names and values are the arg -values. Default: None.

      • -
      • vid_cap_from_frm_args – the arg dict for video captioning from -sampled frames from the video with keys are the arg names and -values are the arg values. Default: None.

      • -
      • vid_tag_from_aud_args – the arg dict for video tagging from audio -streams in the video with keys are the arg names and values are the -arg values. Default: None.

      • -
      • vid_tag_from_frm_args – the arg dict for video tagging from -sampled frames from the video with keys are the arg names and -values are the arg values. Default: None.

      • -
      • keep_tag_num – max number N of tags from sampled frames to keep. -Too many tags might bring negative influence to summarized text, so -we consider to only keep the N most frequent tags. Default: 5.

      • -
      • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only summarized captions in the -final datasets and the original captions will be removed. It’s True -in default.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      -
      - -
      -
      -process_batched(samples, rank=None)[source]
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.GenerateInstructionMapper(hf_model: str = 'Qwen/Qwen-7B-Chat', seed_file: str = '', instruct_num: int[int] = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str | None = None, qa_pair_template: str | None = None, example_template: str | None = None, qa_extraction_pattern: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

Mapper to generate new instruction text data.
You should configure an empty dataset in your yaml config file:
```
generated_dataset_config:
  type: ‘EmptyFormatter’  # use RayEmptyFormatter when enable ray
  length: ${The number of generated samples}
  feature_keys: ${text key}
```
The number of samples generated is determined by the length of the empty dataset.

      -
      -
      -__init__(hf_model: str = 'Qwen/Qwen-7B-Chat', seed_file: str = '', instruct_num: int[int] = 3, trust_remote_code: bool = False, similarity_threshold: float = 0.7, prompt_template: str | None = None, qa_pair_template: str | None = None, example_template: str | None = None, qa_extraction_pattern: str | None = None, enable_vllm: bool = True, tensor_parallel_size: int | None = None, max_model_len: int | None = None, max_num_seqs: int = 256, sampling_params: Dict = {}, *args, **kwargs)[source]
      -
      -

      Initialization method.

      -
      -
      param hf_model:
      -

Hugging Face model ID.

      -
      -
      param seed_file:
      -

      Seed file path, chatml format.

      -
      -
      param instruct_num:
      -

      The number of instruction samples. -Randomly select N samples from “seed_file” and -put them into prompt as instruction samples.

      -
      -
      param trust_remote_code:
      -

      passed to transformers

      -
      -
      param similarity_threshold:
      -

      The similarity score threshold -between the generated samples and the seed samples. -Range from 0 to 1. Samples with similarity score less than -this threshold will be kept.

      -
      -
      param prompt_template:
      -

      Prompt template for generate samples. -Please make sure the template contains “{augmented_data}”, -which corresponds to the augmented samples.

      -
      -
      param qa_pair_template:
      -

Prompt template for generating the question and answer pair description. Please make sure the template contains two “{}” to format question and answer.
Default: ‘【问题】\n{}\n【回答】\n{}\n‘.

      -
      -
      -
      param example_template:
      -

Prompt template for generating examples. Please make sure the template contains “{qa_pairs}”, which corresponds to the question and answer pair description generated by param qa_pair_template.
Default: ‘\n如下是一条示例数据:\n{qa_pairs}’.
      -
      param qa_extraction_pattern:
      -

      Regular expression pattern for parsing -question and answer from model response.

      -
      -
      param enable_vllm:
      -

      Whether to use vllm for inference acceleration.

      -
      -
      param tensor_parallel_size:
      -

      It is only valid when enable_vllm is True. -The number of GPUs to use for distributed execution with tensor -parallelism.

      -
      -
      param max_model_len:
      -

      It is only valid when enable_vllm is True. -Model context length. If unspecified, will be automatically -derived from the model config.

      -
      -
      param max_num_seqs:
      -

      It is only valid when enable_vllm is True. -Maximum number of sequences to be processed in a single iteration.

      -
      -
      param sampling_params:
      -

      Sampling parameters for text generation. -e.g {‘temperature’: 0.9, ‘top_p’: 0.95}

      -
      -
      param args:
      -

      extra args

      -
      -
      param kwargs:
      -

      extra args

      -
      -
      -
      -
      -
      - -
      -
      -load_seed_qa_samples(seed_file)[source]
      -

      Load QA pairs from chatml format file.

      -
      - -
      -
      -build_prompt(qa_samples, prompt_template)[source]
      -
      - -
      -
      -parse_chatml_str(input_str)[source]
      -
      - -
      -
      -parse_response(response_str)[source]
      -
      - -
      -
      -max_rouge_l_score(reference, candidates)[source]
      -
      - -
      -
      -process_single(sample=None, rank=None)[source]
      -

      For sample level, sample –> sample

      -
      -
      Parameters:
      -

      sample – sample to process

      -
      -
      Returns:
      -

      processed sample

      -
      -
      -
      - -
      - -
      -
      -class data_juicer.ops.mapper.FixUnicodeMapper(normalization: str | None = None, *args, **kwargs)[source]
      -

      Bases: Mapper

      -

      Mapper to fix unicode errors in text samples.

      -
      -
      -__init__(normalization: str | None = None, *args, **kwargs)[source]
      -

      Initialization method.

      -
      -
      Parameters:
      -
        -
• normalization – the specified form of Unicode normalization mode, which can be one of [‘NFC’, ‘NFKC’, ‘NFD’, ‘NFKD’]; default ‘NFC’.

      • -
      • args – extra args

      • -
      • kwargs – extra args

      • -
      -
      -
      +

      Note

      +

      This is a batched_OP, whose the input and output type are +both list. Suppose there are $N$ input sample list with batch +size as $b$, and denote caption_num as $M$. +the number of total samples after generation is $2Nb$ +for ‘random_any’ and ‘similar_one’ mode, +and $(1+M)Nb$ for ‘all’ mode.

      +
    -
    -
    -process_batched(samples)[source]
    -
    -
    -
    -class data_juicer.ops.mapper.NlpaugEnMapper(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to simply augment samples in English based on nlpaug library.

    +

    Mapper to generate video captions by summarizing several kinds of generated +texts (captions from video/audio/frames, tags from audio/frames, …)

    -
    -__init__(sequential: bool = False, aug_num: int[int] = 1, keep_original_sample: bool = True, delete_random_word: bool = False, swap_random_word: bool = False, spelling_error_word: bool = False, split_random_word: bool = False, keyboard_error_char: bool = False, ocr_error_char: bool = False, delete_random_char: bool = False, swap_random_char: bool = False, insert_random_char: bool = False, *args, **kwargs)[source]
    -

    Initialization method. All augmentation methods use default parameters -in default. We recommend you to only use 1-3 augmentation methods at a -time. Otherwise, the semantics of samples might be changed -significantly.

    +
    +__init__(hf_summarizer: str | None = None, trust_remote_code: bool = False, consider_video_caption_from_video: bool = True, consider_video_caption_from_audio: bool = True, consider_video_caption_from_frames: bool = True, consider_video_tags_from_audio: bool = True, consider_video_tags_from_frames: bool = True, vid_cap_from_vid_args: Dict | None = None, vid_cap_from_frm_args: Dict | None = None, vid_tag_from_aud_args: Dict | None = None, vid_tag_from_frm_args: Dict | None = None, keep_tag_num: int[int] = 5, keep_original_sample: bool = True, *args, **kwargs)[source]
    +

    Initialization method.

    Parameters:
      -
    • sequential – whether combine all augmentation methods to a -sequence. If it’s True, a sample will be augmented by all opened -augmentation methods sequentially. If it’s False, each opened -augmentation method would generate its augmented samples -independently.

    • -
    • aug_num – number of augmented samples to be generated. If -sequential is True, there will be total aug_num augmented samples -generated. If it’s False, there will be (aug_num * -#opened_aug_method) augmented samples generated.

    • +
    • hf_summarizer – the summarizer model used to summarize texts +generated by other methods.

    • +
    • consider_video_caption_from_video – whether to consider the video +caption generated from video directly in the summarization process. +Default: True.

    • +
    • consider_video_caption_from_audio – whether to consider the video +caption generated from audio streams in the video in the +summarization process. Default: True.

    • +
    • consider_video_caption_from_frames – whether to consider the +video caption generated from sampled frames from the video in the +summarization process. Default: True.

    • +
    • consider_video_tags_from_audio – whether to consider the video +tags generated from audio streams in the video in the summarization +process. Default: True.

    • +
    • consider_video_tags_from_frames – whether to consider the video +tags generated from sampled frames from the video in the +summarization process. Default: True.

    • +
    • vid_cap_from_vid_args – the arg dict for video captioning from +video directly with keys are the arg names and values are the arg +values. Default: None.

    • +
    • vid_cap_from_frm_args – the arg dict for video captioning from +sampled frames from the video with keys are the arg names and +values are the arg values. Default: None.

    • +
    • vid_tag_from_aud_args – the arg dict for video tagging from audio +streams in the video with keys are the arg names and values are the +arg values. Default: None.

    • +
    • vid_tag_from_frm_args – the arg dict for video tagging from +sampled frames from the video with keys are the arg names and +values are the arg values. Default: None.

    • +
    • keep_tag_num – max number N of tags from sampled frames to keep. +Too many tags might bring negative influence to summarized text, so +we consider to only keep the N most frequent tags. Default: 5.

    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only generated texts in the final -datasets and the original texts will be removed. It’s True in -default.

    • -
    • delete_random_word – whether to open the augmentation method of -deleting random words from the original texts. e.g. “I love LLM” -–> “I LLM”

    • -
    • swap_random_word – whether to open the augmentation method of -swapping random contiguous words in the original texts. e.g. “I -love LLM” –> “Love I LLM”

    • -
    • spelling_error_word – whether to open the augmentation method of -simulating the spelling error for words in the original texts. e.g. -“I love LLM” –> “Ai love LLM”

    • -
    • split_random_word – whether to open the augmentation method of -splitting words randomly with whitespaces in the original texts. -e.g. “I love LLM” –> “I love LL M”

    • -
    • keyboard_error_char – whether to open the augmentation method of -simulating the keyboard error for characters in the original texts. -e.g. “I love LLM” –> “I ;ov4 LLM”

    • -
    • ocr_error_char – whether to open the augmentation method of -simulating the OCR error for characters in the original texts. -e.g. “I love LLM” –> “I 10ve LLM”

    • -
    • delete_random_char – whether to open the augmentation method of -deleting random characters from the original texts. e.g. “I love -LLM” –> “I oe LLM”

    • -
    • swap_random_char – whether to open the augmentation method of -swapping random contiguous characters in the original texts. -e.g. “I love LLM” –> “I ovle LLM”

    • -
    • insert_random_char – whether to open the augmentation method of -inserting random characters into the original texts. e.g. “I love -LLM” –> “I ^lKove LLM”

    • +it’s set to False, there will be only summarized captions in the +final datasets and the original captions will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -1901,27 +1721,27 @@
    -
    -process_batched(samples)[source]
    +
    +process_batched(samples, rank=None)[source]
    -
    -class data_juicer.ops.mapper.VideoCaptioningFromFramesMapper(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoCaptioningFromVideoMapper(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    Mapper to generate samples whose captions are generated based on -an image-to-text model and sampled video frames. Captions from different -frames will be concatenated to a single string.

    +a video-to-text model and sampled video frame.

    -
    -__init__(hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]
    +
    +__init__(hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d', trust_remote_code: bool = False, caption_num: int[int] = 1, keep_candidate_mode: str = 'random_any', keep_original_sample: bool = True, prompt: str | None = None, prompt_key: str | None = None, frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, horizontal_flip: bool = False, vertical_flip: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • hf_img2seq – model name on huggingface to generate caption

    • +
    • hf_video_blip – video-blip model name on huggingface +to generate caption

    • caption_num – how many candidate captions to generate for each video

    • keep_candidate_mode

      retain strategy for the generated @@ -1954,7 +1774,7 @@ it’s set to False, there will be only generated captions in the final datasets and the original captions will be removed. It’s True in default.

    • -
    • prompt – a string prompt to guide the generation of image-to-text +

    • prompt – a string prompt to guide the generation of video-blip model for all samples globally. It’s None in default, which means no prompt provided.

    • prompt_key – the key name of fields in samples to store prompts @@ -1984,8 +1804,8 @@

    -
    -process_batched(samples, rank=None, context=False)[source]
    +
    +process_batched(samples, rank=None, context=False)[source]
    Parameters:

    samples

    @@ -2008,21 +1828,172 @@
    -
    -class data_juicer.ops.mapper.RemoveLongWordsMapper(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Simple wrapper for FFmpeg video filters.

    +
    +
    +__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • filter_name – ffmpeg video filter name.

    • +
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • +
    • global_args – list-arguments passed to ffmpeg command-line.

    • +
    • capture_stderr – whether to capture stderr.

    • +
    • overwrite_output – whether to overwrite output file.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
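A construction sketch for the FFmpeg wrapper; the ‘scale’ filter and its width/height options are standard FFmpeg, while the commented sample layout (a ‘videos’ field of file paths) is an assumption here.

from data_juicer.ops.mapper import VideoFFmpegWrappedMapper

# Rescale every video in a sample to 640x360 via the FFmpeg 'scale' filter.
op = VideoFFmpegWrappedMapper(
    filter_name='scale',
    filter_kwargs={'width': 640, 'height': 360},
    capture_stderr=True,
    overwrite_output=True,
)
# sample = {'videos': ['path/to/video.mp4'], 'text': '...'}  # assumed sample layout
# processed = op.process_single(sample)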
    + +
    +
    +class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Mapper to blur faces detected in videos.

    +
    +
    +__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • cv_classifier – OpenCV classifier path for face detection. +By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • +
    • blur_type – Type of blur kernel, including +[‘mean’, ‘box’, ‘gaussian’].

    • +
    • radius – Radius of blur kernel.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.VideoRemoveWatermarkMapper(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +

    Bases: Mapper

    +

    Remove the watermarks in videos given regions.

    +
    +
    +__init__(roi_strings: List[str] = ['0,0,0.1,0.1'], roi_type: str = 'ratio', roi_key: str | None = None, frame_num: int[int] = 10, min_frame_threshold: int[int] = 7, detection_method: str = 'pixel_value', *args, **kwargs)[source]
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • roi_strings – a given list of regions the watermarks locate. +The format of each can be “x1, y1, x2, y2”, “(x1, y1, x2, y2)”, +or “[x1, y1, x2, y2]”.

    • +
• roi_type – the roi string type. When the type is ‘pixel’, (x1, y1) and (x2, y2) are the pixel locations of the top left and bottom right corners respectively. If the roi_type is ‘ratio’, the coordinates are normalized by widths and heights.

    • +
    • roi_key – the key name of fields in samples to store roi_strings +for each sample. It’s used for set different rois for different +samples. If it’s none, use rois in parameter “roi_strings”. +It’s None in default.

    • +
    • frame_num – the number of frames to be extracted uniformly from +the video to detect the pixels of watermark.

    • +
• min_frame_threshold – a coordinate is considered to be the location of a watermark pixel when it is detected as such in at least min_frame_threshold frames.

    • +
    • detection_method – the method to detect the pixels of watermark. +If it is ‘pixel_value’, we consider the distribution of pixel +value in each frame. If it is ‘pixel_diversity’, we will consider +the pixel diversity in different frames. The min_frame_threshold +is useless and frame_num must be greater than 1 in +‘pixel_diversity’ mode.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process_single(sample, context=False)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.mapper.VideoResizeAspectRatioMapper(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to remove long words within a specific range.

    +

    Mapper to resize videos by aspect ratio. +AspectRatio = W / H.

    +
    +
    +STRATEGY = ['decrease', 'increase']
    +
    +
    -
    -__init__(min_len: int = 1, max_len: int = 9223372036854775807, *args, **kwargs)[source]
    +
    +__init__(min_ratio: str = '9/21', max_ratio: str = '21/9', strategy: str = 'increase', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • min_len – The min mapper word length in this op, words -will be filtered if their length is below this parameter.

    • -
    • max_len – The max mapper word length in this op, words -will be filtered if their length exceeds this parameter.

    • +
• min_ratio – The minimum aspect ratio to enforce; videos with an aspect ratio below min_ratio will be resized to match this minimum ratio. The ratio should be provided as a string in the format “9:21” or “9/21”.

    • +
• max_ratio – The maximum aspect ratio to enforce; videos with an aspect ratio above max_ratio will be resized to match this maximum ratio. The ratio should be provided as a string in the format “21:9” or “21/9”.

    • +
    • strategy – The resizing strategy to apply when adjusting the +video dimensions. It can be either ‘decrease’ to reduce the +dimension or ‘increase’ to enlarge it. Accepted values are +[‘decrease’, ‘increase’].

    • args – extra args

    • kwargs – extra args

    @@ -2031,14 +2002,18 @@
    -
    -should_keep_long_word(word)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    +
    +process_single(sample)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    @@ -2089,19 +2064,25 @@
    -
    -class data_juicer.ops.mapper.CleanEmailMapper(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to clean email in text samples.

    +

    Mapper to split video by duration.

    -
    -__init__(pattern: str | None = None, repl: str = '', *args, **kwargs)[source]
    +
    +__init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • pattern – regular expression pattern to search for within text.

    • -
    • repl – replacement string, default is empty string.

    • +
    • split_duration – duration of each video split in seconds.

    • +
    • min_last_split_duration – The minimum allowable duration in +seconds for the last video split. If the duration of the last +split is less than this value, it will be discarded.

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only cut sample in the +final datasets and the original sample will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -2110,28 +2091,33 @@
    -
    -process_batched(samples)[source]
    +
    +split_videos_by_duration(video_key, container)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.ReplaceContentMapper(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitByKeyFrameMapper(keep_original_sample: bool = True, *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to replace all content in the text that matches -a specific regular expression pattern with a designated -replacement string.

    +

    Mapper to split video by key frame.

    -
    -__init__(pattern: str | List[str] | None = None, repl: str | List[str] = '', *args, **kwargs)[source]
    +
    +__init__(keep_original_sample: bool = True, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • pattern – regular expression pattern(s) to search for within text

    • -
    • repl – replacement string(s), default is empty string

    • +
    • keep_original_sample – whether to keep the original sample. If +it’s set to False, there will be only split sample in the +final datasets and the original sample will be removed. It’s True +in default.

    • args – extra args

    • kwargs – extra args

    @@ -2140,29 +2126,39 @@
    -
    -process_batched(samples)[source]
    +
    +get_split_key_frame(video_key, container)[source]
    +
    + +
    +
    +process_batched(samples)[source]
    -
    -class data_juicer.ops.mapper.AudioFFmpegWrappedMapper(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoSplitBySceneMapper(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]

    Bases: Mapper

    -

    Simple wrapper for FFmpeg audio filters.

    +

    Mapper to cut videos into scene clips.

    +
    +
    +avaliable_detectors = {'AdaptiveDetector': ['window_width', 'min_content_val', 'weights', 'luma_only', 'kernel_size', 'video_manager', 'min_delta_hsv'], 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], 'ThresholdDetector': ['fade_bias', 'add_final_scene', 'method', 'block_size']}
    +
    +
    -
    -__init__(filter_name: str | None = None, filter_kwargs: Dict | None = None, global_args: List[str] | None = None, capture_stderr: bool = True, overwrite_output: bool = True, *args, **kwargs)[source]
    +
    +__init__(detector: str = 'ContentDetector', threshold: float[float] = 27.0, min_scene_len: int[int] = 15, show_progress: bool = False, *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • filter_name – ffmpeg audio filter name.

    • -
    • filter_kwargs – keyword-arguments passed to ffmpeg filter.

    • -
    • global_args – list-arguments passed to ffmpeg command-line.

    • -
    • capture_stderr – whether to capture stderr.

    • -
    • overwrite_output – whether to overwrite output file.

    • +
    • detector – Algorithm from scenedetect.detectors. Should be one of [‘ContentDetector’, ‘ThresholdDetector’, ‘AdaptiveDetector’]. (See the usage sketch after this list.)

    • +
    • threshold – Threshold passed to the detector.

    • +
    • min_scene_len – Minimum length of any scene.

    • +
    • show_progress – Whether to show progress from scenedetect.

    • args – extra args

    • kwargs – extra args
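    A usage sketch for the scene splitter, assuming the scenedetect dependency is installed and the same placeholder sample schema as in the sketches above:

    from data_juicer.ops.mapper import VideoSplitBySceneMapper

    op = VideoSplitBySceneMapper(detector='ContentDetector',
                                 threshold=27.0,
                                 min_scene_len=15,
                                 show_progress=False)
    sample = {'text': '<__dj__video>', 'videos': ['path/to/video.mp4']}  # placeholder
    scene_clips = op.process_single(sample)

    Judging from the avaliable_detectors mapping above, detector-specific options (e.g. luma_only for ContentDetector) are presumably forwarded through the extra kwargs, but this excerpt does not state that explicitly.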

    @@ -2171,8 +2167,8 @@
    -
    -process_single(sample)[source]
    +
    +process_single(sample, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2187,25 +2183,22 @@
    -
    -class data_juicer.ops.mapper.VideoSplitByDurationMapper(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoTaggingFromAudioMapper(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to split video by duration.

    +

    Mapper to generate video tags from the audio streams extracted from videos, using the Audio Spectrogram Transformer.

    -
    -__init__(split_duration: float = 10, min_last_split_duration: float = 0, keep_original_sample: bool = True, *args, **kwargs)[source]
    +
    +__init__(hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593', trust_remote_code: bool = False, tag_field_name: str = '__dj__video_audio_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • split_duration – duration of each video split in seconds.

    • -
    • min_last_split_duration – The minimum allowable duration in -seconds for the last video split. If the duration of the last -split is less than this value, it will be discarded.

    • -
    • keep_original_sample – whether to keep the original sample. If -it’s set to False, there will be only cut sample in the -final datasets and the original sample will be removed. It’s True -in default.

    • +
    • hf_ast – path or name of the Hugging Face AST model used to tag the audio streams.

    • +
    • trust_remote_code – whether to trust the remote code of HF models

    • +
    • tag_field_name – the field name to store the tags. It is “__dj__video_audio_tags__” by default. (See the usage sketch after this list.)

    • args – extra args

    • kwargs – extra args
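    A usage sketch; the first run would download the AST model from the Hugging Face Hub, and the sample schema, the path, and the assumption that the tags land in a top-level field named by tag_field_name are illustrative rather than confirmed by this excerpt.

    from data_juicer.ops.mapper import VideoTaggingFromAudioMapper

    op = VideoTaggingFromAudioMapper(hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593',
                                     trust_remote_code=False,
                                     tag_field_name='__dj__video_audio_tags__')
    sample = {'text': '<__dj__video>', 'videos': ['path/to/video.mp4']}  # placeholder
    tagged = op.process_single(sample)
    print(tagged.get('__dj__video_audio_tags__'))  # assumed location of the tags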

    @@ -2214,34 +2207,48 @@
    -
    -split_videos_by_duration(video_key, container)[source]
    -
    - -
    -
    -process_batched(samples)[source]
    -
    +
    +process_single(sample, rank=None)[source]
    +

    For sample level, sample –> sample

    +
    +
    Parameters:
    +

    sample – sample to process

    +
    +
    Returns:
    +

    processed sample

    +
    +
    +
    -
    -class data_juicer.ops.mapper.VideoFaceBlurMapper(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.VideoTaggingFromFramesMapper(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to blur faces detected in videos.

    +

    Mapper to generate video tags from frames extracted from videos.

    -
    -__init__(cv_classifier: str = '', blur_type: str = 'gaussian', radius: float = 2, *args, **kwargs)[source]
    +
    +__init__(frame_sampling_method: str = 'all_keyframes', frame_num: int[int] = 3, tag_field_name: str = '__dj__video_frame_tags__', *args, **kwargs)[source]

    Initialization method.

    Parameters:
      -
    • cv_classifier – OpenCV classifier path for face detection. -By default, we will use ‘haarcascade_frontalface_alt.xml’.

    • -
    • blur_type – Type of blur kernel, including -[‘mean’, ‘box’, ‘gaussian’].

    • -
    • radius – Radius of blur kernel.

    • +
    • frame_sampling_method – sampling method for extracting frame images from the videos. Should be one of [“all_keyframes”, “uniform”]. The former extracts all key frames (the number of which depends on the duration of the video), while the latter extracts a specified number of frames uniformly from the video. Default: “all_keyframes”.

    • +
    • frame_num – the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is “uniform”. If it is 1, only the middle frame will be extracted. If it is 2, only the first and the last frames will be extracted. If it is larger than 2, the other frames, in addition to the first and the last, will be extracted uniformly within the video duration.

    • +
    • tag_field_name – the field name to store the tags. It is “__dj__video_frame_tags__” by default. (See the usage sketch after this list.)

    • args – extra args

    • kwargs – extra args
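    A usage sketch with the same caveats as the audio-tagging example above (placeholder sample schema and path; the tagging model is downloaded on first use):

    from data_juicer.ops.mapper import VideoTaggingFromFramesMapper

    # Tag each video from 3 uniformly sampled frames instead of all key frames.
    op = VideoTaggingFromFramesMapper(frame_sampling_method='uniform', frame_num=3)
    sample = {'text': '<__dj__video>', 'videos': ['path/to/video.mp4']}  # placeholder
    tagged = op.process_single(sample)
    print(tagged.get('__dj__video_frame_tags__'))  # assumed location of the tags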

    @@ -2250,8 +2257,8 @@
    -
    -process_single(sample, context=False)[source]
    +
    +process_single(sample, rank=None, context=False)[source]

    For sample level, sample –> sample

    Parameters:
    @@ -2266,18 +2273,17 @@
    -
    -class data_juicer.ops.mapper.ImageTaggingMapper(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    +
    +class data_juicer.ops.mapper.WhitespaceNormalizationMapper(*args, **kwargs)[source]

    Bases: Mapper

    -

    Mapper to generate image tags.

    +

    Mapper to normalize different kinds of whitespace characters to the plain whitespace ‘ ’ (0x20) in text samples.

    +

    Different kinds of whitespace characters are listed here: https://en.wikipedia.org/wiki/Whitespace_character (a usage sketch follows this entry).

    -
    -__init__(tag_field_name: str = '__dj__image_tags__', *args, **kwargs)[source]
    -

    Initialization method. -:param tag_field_name: the field name to store the tags. It’s

    -
    -

    “__dj__image_tags__” in default.

    -
    +
    +__init__(*args, **kwargs)[source]
    +

    Initialization method.

    Parameters:
      @@ -2289,18 +2295,9 @@
    -
    -process_single(sample, rank=None, context=False)[source]
    -

    For sample level, sample –> sample

    -
    -
    Parameters:
    -

    sample – sample to process

    -
    -
    Returns:
    -

    processed sample

    -
    -
    -
    +
    +process_batched(samples)[source]
    +
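    A usage sketch for this op; the column-oriented batch layout and the ‘text’ column name follow the usual data_juicer conventions and are assumed here.

    from data_juicer.ops.mapper import WhitespaceNormalizationMapper

    op = WhitespaceNormalizationMapper()
    # \u00a0 is a no-break space and \u3000 an ideographic space.
    samples = {'text': ['no-break\u00a0space and ideographic\u3000space']}
    out = op.process_batched(samples)
    print(out['text'])  # both should now be the plain whitespace ' ' (0x20)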
    diff --git a/genindex.html b/genindex.html index 71efd4518..a34a5abdb 100644 --- a/genindex.html +++ b/genindex.html @@ -270,12 +270,12 @@

    _

  • (data_juicer.ops.mapper.CleanLinksMapper method)
  • (data_juicer.ops.mapper.ExpandMacroMapper method) -
  • -
  • (data_juicer.ops.mapper.ExtractQAMapper method)
  • (data_juicer.ops.mapper.FixUnicodeMapper method)
  • -
  • (data_juicer.ops.mapper.GenerateInstructionMapper method) +
  • (data_juicer.ops.mapper.GenerateQAFromExamplesMapper method) +
  • +
  • (data_juicer.ops.mapper.GenerateQAFromTextMapper method)
  • (data_juicer.ops.mapper.ImageBlurMapper method)
  • @@ -293,7 +293,7 @@

    _

  • (data_juicer.ops.mapper.NlpcdaZhMapper method)
  • -
  • (data_juicer.ops.mapper.OptimizeInstructionMapper method) +
  • (data_juicer.ops.mapper.OptimizeQAMapper method)
  • (data_juicer.ops.mapper.PunctuationNormalizationMapper method)
  • @@ -412,8 +412,12 @@

    B

    @@ -635,8 +639,6 @@

    D

  • module
  • - -
    • data_juicer.ops.mapper @@ -651,6 +653,8 @@

      D

    • module
    + +
    • data_juicer.tools @@ -667,6 +671,36 @@

      D

  • Deduplicator (class in data_juicer.ops)
  • +
  • DEFAULT_EXAMPLE_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) +
  • +
  • DEFAULT_INPUT_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) + +
  • +
  • DEFAULT_OUTPUT_PATTERN (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) + +
  • +
  • DEFAULT_QA_PAIR_TEMPLATE (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) + +
  • +
  • DEFAULT_SYSTEM_PROMPT (data_juicer.ops.mapper.GenerateQAFromExamplesMapper attribute) + +
  • DiversityAnalysis (class in data_juicer.analysis)
  • DocumentDeduplicator (class in data_juicer.ops.deduplicator) @@ -694,11 +728,11 @@

    E

  • execute_and_probe() (data_juicer.core.Adapter static method)
  • Executor (class in data_juicer.core) -
  • -
  • ExpandMacroMapper (class in data_juicer.ops.mapper)
  • @@ -735,14 +767,16 @@

    F

    G