diff --git a/.coveragerc b/.coveragerc index d4a7a6d63..d95c7fc28 100644 --- a/.coveragerc +++ b/.coveragerc @@ -9,3 +9,6 @@ omit = # avoid measuring code of unittest tests/* + +[report] +ignore_errors = True diff --git a/.github/workflows/deploy_sphinx_docs.yml b/.github/workflows/deploy_sphinx_docs.yml index 9c8ae89a0..5cf0205ae 100644 --- a/.github/workflows/deploy_sphinx_docs.yml +++ b/.github/workflows/deploy_sphinx_docs.yml @@ -12,13 +12,16 @@ on: jobs: pages: runs-on: ubuntu-20.04 + strategy: + matrix: + python-version: [ "3.9", "3.10" ] steps: - name: Checkout uses: actions/checkout@v4 - name: Setup Python ${{ matrix.python-version }} uses: actions/setup-python@master with: - python_version: ${{ matrix.python-version }} + python-version: ${{ matrix.python-version }} - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/perf-bench.yml b/.github/workflows/perf-bench.yml new file mode 100644 index 000000000..4094070db --- /dev/null +++ b/.github/workflows/perf-bench.yml @@ -0,0 +1,56 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python + +name: performance_benchmark + +on: + workflow_dispatch: + push: + branches: + - main + +permissions: + contents: read + +env: + ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true + +jobs: + perf_bench: + runs-on: [GPU, unittest] + environment: Testing + steps: + - uses: actions/checkout@v3 + with: + path: dj-${{ github.run_id }} + + - name: Setup docker compose + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose up -d + + - name: Install data-juicer + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head pip install -e .\[all\] + + - name: Clean dataset cache + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head rm -rf /data/huggingface/dataset + + - name: Run performance benchmark standalone + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + run: | + docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }} + + - name: Remove docker compose + working-directory: dj-${{ github.run_id }}/.github/workflows/docker + if: always() + run: | + docker compose down --remove-orphans + + - name: Cleanup workspace + if: always() + run: | + rm -rf dj-${{ github.run_id }} diff --git a/configs/config_all.yaml b/configs/config_all.yaml index ea10be519..756cadd81 100644 --- a/configs/config_all.yaml +++ b/configs/config_all.yaml @@ -15,6 +15,7 @@ text_keys: 'text' # the key name of fi suffixes: [] # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx'] use_cache: true # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache ds_cache_dir: null # cache dir for Hugging Face datasets. In default, it\'s the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir +open_monitor: true # Whether to open the monitor to trace resource utilization for each OP during data processing. It\'s True in default. 
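(A hedged usage sketch for the new `open_monitor` option added above: `init_configs` and `Executor` appear in this patch, but the config file name and the exact CLI invocation are assumptions, not part of the diff.)

```python
# Minimal sketch, assuming a working data-juicer install; `demo.yaml` is a
# hypothetical config file.
from data_juicer.config import init_configs
from data_juicer.core import Executor

# `--open_monitor false` switches off the new per-OP resource monitor; it
# defaults to true and writes results to `<work_dir>/monitor/monitor.json`.
cfg = init_configs(args=['--config', 'demo.yaml', '--open_monitor', 'false'])
Executor(cfg).run()
```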
use_checkpoint: false # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerun the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning. temp_dir: null # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be caution when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory. open_tracer: false # whether to open the tracer to trace the changes during process. It might take more time when opening tracer @@ -211,6 +212,7 @@ process: radius: 2 # radius of blur kernel - image_tagging_mapper: # Mapper to generate image tags. tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" in default. + mem_required: '9GB' - nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library sequential: false # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently. aug_num: 1 # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated. @@ -257,6 +259,12 @@ process: model_params: {} # Parameters for initializing the API model. sampling_params: {} # Extra parameters passed to the API call. - punctuation_normalization_mapper: # normalize unicode punctuations to English punctuations. + - python_file_mapper: # executing Python function defined in a file. + file_path: '' # The path to the Python file containing the function to be executed. + function_name: 'process_single' # The name of the function defined in the file to be executed. + - python_lambda_mapper: # executing Python lambda function on data samples. + lambda_str: '' # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used. + batched: False # A boolean indicating whether to process input data in batches. - remove_bibliography_mapper: # remove bibliography from Latex text. - remove_comments_mapper: # remove comments from Latex text, code, etc. doc_type: tex # comment type you want to remove. Only support 'tex' for now. @@ -375,6 +383,7 @@ process: frame_sampling_method: 'all_keyframes' # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes". frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
+ mem_required: '9GB' - whitespace_normalization_mapper: # normalize different kinds of whitespaces to English whitespace. # Filter ops @@ -607,6 +616,7 @@ process: frame_num: 3 # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration. tag_field_name: '__dj__video_frame_tags__' # the field name to store the tags. It's "__dj__video_frame_tags__" in default. any_or_all: any # keep this sample when any/all videos meet the filter condition + mem_required: '9GB' - words_num_filter: # filter text with number of words out of specific range lang: en # sample in which language tokenization: false # whether to use model to tokenize documents diff --git a/data_juicer/__init__.py b/data_juicer/__init__.py index 7c02d6d0e..91ce93bae 100644 --- a/data_juicer/__init__.py +++ b/data_juicer/__init__.py @@ -1,4 +1,4 @@ -__version__ = '1.0.0' +__version__ = '1.0.1' import os import subprocess diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py index 76a20b786..71f871f10 100644 --- a/data_juicer/config/config.py +++ b/data_juicer/config/config.py @@ -230,6 +230,12 @@ def init_configs(args: Optional[List[str]] = None): help='The compression method of the cache file, which can be' 'specified in ["gzip", "zstd", "lz4"]. If this parameter is' 'None, the cache file will not be compressed.') + parser.add_argument( + '--open_monitor', + type=bool, + default=True, + help='Whether to open the monitor to trace resource utilization for ' + 'each OP during data processing. It\'s True in default.') parser.add_argument( '--use_checkpoint', type=bool, diff --git a/data_juicer/core/data.py b/data_juicer/core/data.py index 9cef1fe89..361f6e8a0 100644 --- a/data_juicer/core/data.py +++ b/data_juicer/core/data.py @@ -164,13 +164,16 @@ def __getitem__(self, key): res = super().__getitem__(key) return nested_obj_factory(res) - def process(self, - operators, - *, - work_dir=None, - exporter=None, - checkpointer=None, - tracer=None): + def process( + self, + operators, + *, + work_dir=None, + exporter=None, + checkpointer=None, + tracer=None, + open_monitor=True, + ): if operators is None: return self @@ -179,7 +182,8 @@ def process(self, unforkable_operators = set(UNFORKABLE.modules.keys()) # resource utilization monitor - resource_util_list = [] + if open_monitor: + resource_util_list = [] dataset = self try: @@ -196,12 +200,16 @@ def process(self, 'exporter': exporter, 'tracer': tracer, } - dataset, resource_util_per_op = Monitor.monitor_func( - op.run, args=run_args) + if open_monitor: + dataset, resource_util_per_op = Monitor.monitor_func( + op.run, args=run_args) + else: + dataset = op.run(**run_args) # record processed ops if checkpointer is not None: checkpointer.record(op._op_cfg) - resource_util_list.append(resource_util_per_op) + if open_monitor: + resource_util_list.append(resource_util_per_op) end = time() logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. 
' f'Left {len(dataset)} samples.') @@ -215,7 +223,10 @@ def process(self, 'last op...') dataset.cleanup_cache_files() checkpointer.save_ckpt(dataset) - if work_dir: + if work_dir and open_monitor: + # get the analyzed version + resource_util_list = Monitor.analyze_resource_util_list( + resource_util_list) monitor_dir = os.path.join(work_dir, 'monitor') os.makedirs(monitor_dir, exist_ok=True) with open(os.path.join(monitor_dir, 'monitor.json'), @@ -225,9 +236,7 @@ def process(self, monitor_dir) return dataset - def map(self, *args, **kargs): - """Override the map func, which is called by most common operations, - such that the processed samples can be accessed by nested manner.""" + def update_args(self, args, kargs, is_filter=False): if args: args = list(args) # the first positional para is function @@ -253,15 +262,17 @@ def map(self, *args, **kargs): # batched is required for fault-tolerant or batched OP if callable(getattr( called_func.__self__, - 'is_batched_op')) and called_func.__self__.is_batched_op( - ) or not getattr(called_func.__self__, 'turbo', False): + 'is_batched_op')) and called_func.__self__.is_batched_op(): kargs['batched'] = True kargs['batch_size'] = kargs.pop('batch_size', 1) + elif not getattr(called_func.__self__, 'turbo', False): + kargs['batched'] = True + kargs['batch_size'] = 1 else: kargs['batched'] = False - # rank is required for cuda model loading - if callable( + # rank is required for cuda model loading for map + if not is_filter and callable( getattr(called_func.__self__, 'use_cuda')) and called_func.__self__.use_cuda(): kargs['with_rank'] = True @@ -270,6 +281,14 @@ def map(self, *args, **kargs): new_fingerprint = generate_fingerprint(self, *args, **kargs) kargs['new_fingerprint'] = new_fingerprint + return args, kargs + + def map(self, *args, **kargs): + """Override the map func, which is called by most common operations, + such that the processed samples can be accessed by nested manner.""" + + args, kargs = self.update_args(args, kargs) + if cache_utils.CACHE_COMPRESS: decompress(self, kargs['new_fingerprint'], kargs['num_proc'] if 'num_proc' in kargs else 1) @@ -288,38 +307,7 @@ def map(self, *args, **kargs): def filter(self, *args, **kargs): """Override the filter func, which is called by most common operations, such that the processed samples can be accessed by nested manner.""" - if args: - args = list(args) - # the first positional para is function - if args[0] is None: - args[0] = lambda x: nested_obj_factory(x) - else: - args[0] = wrap_func_with_nested_access(args[0]) - called_func = args[0] - else: - if 'function' not in kargs or kargs['function'] is None: - kargs['function'] = lambda x: nested_obj_factory(x) - else: - kargs['function'] = wrap_func_with_nested_access( - kargs['function']) - called_func = kargs['function'] - - # For wrapped function, try to get its unwrapped (bound) method - while not inspect.ismethod(called_func) and hasattr( - called_func, '__wrapped__'): - called_func = called_func.__wrapped__ - - # Batched is always required for fault tolerance - if inspect.ismethod(called_func): - if callable(getattr( - called_func.__self__, - 'is_batched_op')) and called_func.__self__.is_batched_op(): - kargs['batched'] = True - kargs['batch_size'] = kargs.pop('batch_size', 1) - - if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None: - new_fingerprint = generate_fingerprint(self, *args, **kargs) - kargs['new_fingerprint'] = new_fingerprint + args, kargs = self.update_args(args, kargs, is_filter=True) # For filter, it 
involves a map and a filter operations, so the final # cache files includes two sets with different fingerprint (before and diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py index d9445dad0..f78059247 100644 --- a/data_juicer/core/executor.py +++ b/data_juicer/core/executor.py @@ -193,11 +193,14 @@ def run(self, # - If checkpoint is open, clean the cache files after each process logger.info('Processing data...') tstart = time() - dataset = dataset.process(ops, - work_dir=self.work_dir, - exporter=self.exporter, - checkpointer=self.ckpt_manager, - tracer=self.tracer) + dataset = dataset.process( + ops, + work_dir=self.work_dir, + exporter=self.exporter, + checkpointer=self.ckpt_manager, + tracer=self.tracer, + open_monitor=self.cfg.open_monitor, + ) tend = time() logger.info(f'All OPs are done in {tend - tstart:.3f}s.') diff --git a/data_juicer/core/monitor.py b/data_juicer/core/monitor.py index 67f8f62a5..0210e3732 100644 --- a/data_juicer/core/monitor.py +++ b/data_juicer/core/monitor.py @@ -205,7 +205,10 @@ def monitor_func(func, args=None, sample_interval=0.5): resource_util_dict = {} # start monitor - ctx = get_context('fork') + start_method = 'fork' + if os.name == 'nt': # for Windows + start_method = 'spawn' + ctx = get_context(start_method) with ctx.Manager() as manager: mdict = manager.dict() mdict['stop'] = False diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index 1bfc87fca..72618a6bb 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -70,7 +70,7 @@ def wrapper(samples, *args, **kwargs): return wrapper -def catch_map_single_exception(method): +def catch_map_single_exception(method, return_sample=True): """ For single-map sample-level fault tolerance. The input sample is expected batch_size = 1. 
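(Context for the `monitor.py` hunk above: a standalone sketch of the start-method fallback using only the standard library; nothing data-juicer-specific is assumed.)

```python
import os
from multiprocessing import get_context

# 'fork' is unavailable on Windows (os.name == 'nt'), so the monitor now
# falls back to 'spawn' there; 'fork' is kept elsewhere since it is cheaper.
start_method = 'spawn' if os.name == 'nt' else 'fork'
ctx = get_context(start_method)

if __name__ == '__main__':  # required under 'spawn', which re-imports __main__
    with ctx.Manager() as manager:
        mdict = manager.dict()  # shared state, as in Monitor.monitor_func
        mdict['stop'] = False
        print(start_method, dict(mdict))
```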
@@ -92,8 +92,11 @@ def wrapper(sample, *args, **kwargs): if is_batched(sample): try: sample = convert_dict_list_to_list_dict(sample)[0] - res_sample = method(sample, *args, **kwargs) - return convert_list_dict_to_dict_list([res_sample]) + res = method(sample, *args, **kwargs) + if return_sample: + return convert_list_dict_to_dict_list([res]) + else: + return [res] except Exception as e: from loguru import logger logger.error( @@ -166,9 +169,8 @@ def __init__(self, *args, **kwargs): method = wrap_func_with_nested_access(method) setattr(self, name, method) - @classmethod - def is_batched_op(cls): - return cls._batched_op + def is_batched_op(self): + return self._batched_op def process(self, *args, **kwargs): raise NotImplementedError @@ -326,7 +328,8 @@ def __init__(self, *args, **kwargs): else: self.compute_stats = catch_map_single_exception( self.compute_stats_single) - self.process = catch_map_single_exception(self.process_single) + self.process = catch_map_single_exception(self.process_single, + return_sample=False) # set the process method is not allowed to be overridden def __init_subclass__(cls, **kwargs): diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py index e63ba50d9..3c95d1fe6 100644 --- a/data_juicer/ops/mapper/__init__.py +++ b/data_juicer/ops/mapper/__init__.py @@ -30,6 +30,8 @@ from .optimize_response_mapper import OptimizeResponseMapper from .pair_preference_mapper import PairPreferenceMapper from .punctuation_normalization_mapper import PunctuationNormalizationMapper +from .python_file_mapper import PythonFileMapper +from .python_lambda_mapper import PythonLambdaMapper from .remove_bibliography_mapper import RemoveBibliographyMapper from .remove_comments_mapper import RemoveCommentsMapper from .remove_header_mapper import RemoveHeaderMapper @@ -76,17 +78,18 @@ 'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper', 'PairPreferenceMapper', 'PunctuationNormalizationMapper', - 'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper', - 'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper', - 'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper', - 'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper', - 'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper', - 'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper', - 'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper', - 'VideoExtractFramesMapper', 'VideoFFmpegWrappedMapper', - 'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper', - 'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper', - 'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper', - 'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper', - 'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper' + 'PythonFileMapper', 'PythonLambdaMapper', 'RemoveBibliographyMapper', + 'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper', + 'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper', + 'RemoveSpecificCharsMapper', 'RemoveTableTextMapper', + 'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper', + 'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper', + 'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper', + 'VideoCaptioningFromVideoMapper', 'VideoExtractFramesMapper', + 'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper', + 'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper', 
+ 'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper', + 'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper', + 'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper', + 'WhitespaceNormalizationMapper' ] diff --git a/data_juicer/ops/mapper/python_file_mapper.py b/data_juicer/ops/mapper/python_file_mapper.py new file mode 100644 index 000000000..b74fd96a1 --- /dev/null +++ b/data_juicer/ops/mapper/python_file_mapper.py @@ -0,0 +1,97 @@ +import importlib.util +import inspect +import os + +from ..base_op import OPERATORS, Mapper + +OP_NAME = 'python_file_mapper' + + +@OPERATORS.register_module(OP_NAME) +class PythonFileMapper(Mapper): + """Mapper for executing Python function defined in a file.""" + + def __init__(self, + file_path: str = '', + function_name: str = 'process_single', + batched: bool = False, + **kwargs): + """ + Initialization method. + + :param file_path: The path to the Python file containing the function + to be executed. + :param function_name: The name of the function defined in the file + to be executed. + :param batched: A boolean indicating whether to process input data in + batches. + :param kwargs: Additional keyword arguments passed to the parent class. + """ + self._batched_op = bool(batched) + super().__init__(**kwargs) + + self.file_path = file_path + self.function_name = function_name + if not file_path: + self.func = lambda sample: sample + else: + self.func = self._load_function() + + def _load_function(self): + if not os.path.isfile(self.file_path): + raise FileNotFoundError( + f"The file '{self.file_path}' does not exist.") + + if not self.file_path.endswith('.py'): + raise ValueError( + f"The file '{self.file_path}' is not a Python file.") + + # Load the module from the file + module_name = os.path.splitext(os.path.basename(self.file_path))[0] + spec = importlib.util.spec_from_file_location(module_name, + self.file_path) + module = importlib.util.module_from_spec(spec) + spec.loader.exec_module(module) + + # Fetch the specified function from the module + if not hasattr(module, self.function_name): + raise ValueError( + f"Function '{self.function_name}' not found in '{self.file_path}'." # noqa: E501 + ) + + func = getattr(module, self.function_name) + + if not callable(func): + raise ValueError( + f"The attribute '{self.function_name}' is not callable.") + + # Check that the function has exactly one argument + argspec = inspect.getfullargspec(func) + if len(argspec.args) != 1: + raise ValueError( + f"The function '{self.function_name}' must take exactly one argument" # noqa: E501 + ) + + return func + + def process_single(self, sample): + """Invoke the loaded function with the provided sample.""" + result = self.func(sample) + + if not isinstance(result, dict): + raise ValueError( + f'Function must return a dictionary, got {type(result).__name__} instead.' # noqa: E501 + ) + + return result + + def process_batched(self, samples): + """Invoke the loaded function with the provided samples.""" + result = self.func(samples) + + if not isinstance(result, dict): + raise ValueError( + f'Function must return a dictionary, got {type(result).__name__} instead.' 
# noqa: E501 + ) + + return result diff --git a/data_juicer/ops/mapper/python_lambda_mapper.py b/data_juicer/ops/mapper/python_lambda_mapper.py new file mode 100644 index 000000000..e90c77f48 --- /dev/null +++ b/data_juicer/ops/mapper/python_lambda_mapper.py @@ -0,0 +1,74 @@ +import ast + +from ..base_op import OPERATORS, Mapper + +OP_NAME = 'python_lambda_mapper' + + +@OPERATORS.register_module(OP_NAME) +class PythonLambdaMapper(Mapper): + """Mapper for executing Python lambda function on data samples.""" + + def __init__(self, lambda_str: str = '', batched: bool = False, **kwargs): + """ + Initialization method. + + :param lambda_str: A string representation of the lambda function to be + executed on data samples. If empty, the identity function is used. + :param batched: A boolean indicating whether to process input data in + batches. + :param kwargs: Additional keyword arguments passed to the parent class. + """ + self._batched_op = bool(batched) + super().__init__(**kwargs) + + # Parse and validate the lambda function + if not lambda_str: + self.lambda_func = lambda sample: sample + else: + self.lambda_func = self._create_lambda(lambda_str) + + def _create_lambda(self, lambda_str: str): + # Parse input string into an AST and check for a valid lambda function + try: + node = ast.parse(lambda_str, mode='eval') + + # Check if the body of the expression is a lambda + if not isinstance(node.body, ast.Lambda): + raise ValueError( + 'Input string must be a valid lambda function.') + + # Check that the lambda has exactly one argument + if len(node.body.args.args) != 1: + raise ValueError( + 'Lambda function must have exactly one argument.') + + # Compile the AST to code + compiled_code = compile(node, '<string>', 'eval') + # Safely evaluate the compiled code allowing built-in functions + func = eval(compiled_code, {'__builtins__': __builtins__}) + return func + except Exception as e: + raise ValueError(f'Invalid lambda function: {e}') + + def process_single(self, sample): + # Process the input through the lambda function and return the result + result = self.lambda_func(sample) + + # Check that the result is a valid sample, i.e. a dict + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result + + def process_batched(self, samples): + # Process the input through the lambda function and return the result + result = self.lambda_func(samples) + + # Check that the result is a valid batch of samples, i.e. a dict + if not isinstance(result, dict): + raise ValueError(f'Lambda function must return a dictionary, ' + f'got {type(result).__name__} instead.') + + return result diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index eb521e619..305145e82 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -51,6 +51,11 @@ 'punkt.*.pickle': 'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/' 'data_juicer/models/', + + # ram + 'ram_plus_swin_large_14m.pth': + 'http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/' + 'ram_plus_swin_large_14m.pth', } diff --git a/docs/Operators.md b/docs/Operators.md index f24523dc5..1d6b63ec5 100644 --- a/docs/Operators.md +++ b/docs/Operators.md @@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
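(A hedged usage sketch for the two new OPs above; calling `process_single` directly is an assumption for illustration, since inside a pipeline the OP is driven via its `run` method. `PythonFileMapper` is analogous: point `file_path` at a `.py` file defining a one-argument `process_single(sample)` that returns a dict.)

```python
# Minimal sketch, assuming data-juicer is installed.
from data_juicer.ops.mapper import PythonLambdaMapper

op = PythonLambdaMapper(
    lambda_str="lambda s: {**s, 'text': s['text'].strip()}")
print(op.process_single({'text': '  hello  '}))  # -> {'text': 'hello'}

# A lambda that returns a non-dict raises ValueError, per the validation above.
```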
| Type | Number | Description | |-----------------------------------|:------:|-------------------------------------------------| | [ Formatter ]( #formatter ) | 9 | Discovers, loads, and canonicalizes source data | -| [ Mapper ]( #mapper ) | 59 | Edits and transforms samples | +| [ Mapper ]( #mapper ) | 61 | Edits and transforms samples | | [ Filter ]( #filter ) | 44 | Filters out low-quality samples | | [ Deduplicator ]( #deduplicator ) | 8 | Detects and removes duplicate samples | | [ Selector ]( #selector ) | 4 | Selects top samples based on ranking | @@ -88,6 +88,8 @@ All the specific operators are listed below, each featured with several capabili | optimize_response_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | Optimize the response in question-answering samples. | [code](../data_juicer/ops/mapper/optimize_response_mapper.py) | [tests](../tests/ops/mapper/test_optimize_response_mapper.py) | | pair_preference_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Construct paired preference samples. | [code](../data_juicer/ops/mapper/pair_preference_mapper.py) | [tests](../tests/ops/mapper/test_pair_preference_mapper.py) | | punctuation_normalization_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Normalizes various Unicode punctuations to their ASCII equivalents | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py) | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py) | +| python_file_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Executing Python function defined in a file | [code](../data_juicer/ops/mapper/python_file_mapper.py) | [tests](../tests/ops/mapper/test_python_file_mapper.py) | +| python_lambda_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Executing Python lambda function on data samples | [code](../data_juicer/ops/mapper/python_lambda_mapper.py) | [tests](../tests/ops/mapper/test_python_lambda_mapper.py) | | remove_bibliography_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the bibliography of TeX documents | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py) | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py) | | remove_comments_mapper | 
![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the comments of TeX documents | [code](../data_juicer/ops/mapper/remove_comments_mapper.py) | [tests](../tests/ops/mapper/test_remove_comments_mapper.py) | | remove_header_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names | [code](../data_juicer/ops/mapper/remove_header_mapper.py) | [tests](../tests/ops/mapper/test_remove_header_mapper.py) | diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md index c771a30e9..01f0bdb0a 100644 --- a/docs/Operators_ZH.md +++ b/docs/Operators_ZH.md @@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。 | 类型 | 数量 | 描述 | |------------------------------------|:--:|---------------| | [ Formatter ]( #formatter ) | 9 | 发现、加载、规范化原始数据 | -| [ Mapper ]( #mapper ) | 59 | 对数据样本进行编辑和转换 | +| [ Mapper ]( #mapper ) | 61 | 对数据样本进行编辑和转换 | | [ Filter ]( #filter ) | 44 | 过滤低质量样本 | | [ Deduplicator ]( #deduplicator ) | 8 | 识别、删除重复样本 | | [ Selector ]( #selector ) | 4 | 基于排序选取高质量样本 | @@ -87,6 +87,8 @@ Data-Juicer 中的算子分为以下 5 种类型。 | optimize_response_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 指令优化,优化 response | [code](../data_juicer/ops/mapper/optimize_response_mapper.py) | [tests](../tests/ops/mapper/test_optimize_response_mapper.py) | | pair_preference_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 构造配对的偏好样本 | [code](../data_juicer/ops/mapper/pair_preference_mapper.py) | [tests](../tests/ops/mapper/test_pair_preference_mapper.py) | | punctuation_normalization_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 将各种 Unicode 标点符号标准化为其 ASCII 等效项 | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py) | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py) | +| python_file_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 执行文件中定义的 Python 函数处理样本 | [code](../data_juicer/ops/mapper/python_file_mapper.py) | [tests](../tests/ops/mapper/test_python_file_mapper.py) | +| python_lambda_mapper | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) 
![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 执行 Python lambda 函数处理样本 | [code](../data_juicer/ops/mapper/python_lambda_mapper.py) | [tests](../tests/ops/mapper/test_python_lambda_mapper.py) | | remove_bibliography_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档的参考文献 | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py) | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py) | | remove_comments_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档中的注释 | [code](../data_juicer/ops/mapper/remove_comments_mapper.py) | [tests](../tests/ops/mapper/test_remove_comments_mapper.py) | | remove_header_mapper | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 删除 TeX 文档头,例如标题、章节数字/名称等 | [code](../data_juicer/ops/mapper/remove_header_mapper.py) | [tests](../tests/ops/mapper/test_remove_header_mapper.py) | @@ -117,42 +119,42 @@ Data-Juicer 中的算子分为以下 5 种类型。 ## Filter -| 算子 | 标签 | 描述 | 源码 | 单测样例 | -|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------| -| alphanumeric_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留字母数字比例在指定范围内的样本 | [code](../data_juicer/ops/filter/alphanumeric_filter.py) | [tests](../tests/ops/filter/test_alphanumeric_filter.py) | -| audio_duration_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频的时长在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_duration_filter.py) | [tests](../tests/ops/filter/test_audio_duration_filter.py) | -| audio_nmf_snr_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频信噪比SNR(基于非负矩阵分解方法NMF计算)在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_nmf_snr_filter.py) | [tests](../tests/ops/filter/test_audio_nmf_snr_filter.py) | -| audio_size_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频的大小(bytes)在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_size_filter.py) | [tests](../tests/ops/filter/test_audio_size_filter.py) | -| average_line_length_filter | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) 
![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留平均行长度在指定范围内的样本 | [code](../data_juicer/ops/filter/average_line_length_filter.py) | [tests](../tests/ops/filter/test_average_line_length_filter.py) | -| character_repetition_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留 char-level n-gram 重复比率在指定范围内的样本 | [code](../data_juicer/ops/filter/character_repetition_filter.py) | [tests](../tests/ops/filter/test_character_repetition_filter.py) | -| flagged_words_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留使标记字比率保持在指定阈值以下的样本 | [code](../data_juicer/ops/filter/flagged_words_filter.py) | [tests](../tests/ops/filter/test_flagged_words_filter.py) | -| image_aesthetics_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含美学分数在指定范围内的图像的样本 | [code](../data_juicer/ops/filter/image_aesthetics_filter.py) | [tests](../tests/ops/filter/test_image_aesthetics_filter.py) | -| image_aspect_ratio_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的宽高比在指定范围内的样本 | [code](../data_juicer/ops/filter/image_aspect_ratio_filter.py) | [tests](../tests/ops/filter/test_image_aspect_ratio_filter.py) | -| image_face_count_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片中检测到的人脸数目在指定范围内的样本 | [code](../data_juicer/ops/filter/image_face_count_filter.py) | [tests](../tests/ops/filter/test_image_face_count_filter.py) | -| image_face_ratio_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的最大脸部区域在指定范围内的样本 | [code](../data_juicer/ops/filter/image_face_ratio_filter.py) | [tests](../tests/ops/filter/test_image_face_ratio_filter.py) | -| image_nsfw_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含NSFW分数在指定阈值之下的图像的样本 | [code](../data_juicer/ops/filter/image_nsfw_filter.py) | [tests](../tests/ops/filter/test_image_nsfw_filter.py) | -| image_pair_similarity_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_pair_similarity_filter.py) | [tests](../tests/ops/filter/test_image_pair_similarity_filter.py) | -| image_shape_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_shape_filter.py) | [tests](../tests/ops/filter/test_image_shape_filter.py) | -| image_size_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_size_filter.py) | [tests](../tests/ops/filter/test_image_size_filter.py) | -| image_text_matching_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_text_matching_filter.py) | 
[tests](../tests/ops/filter/test_image_text_matching_filter.py) | -| image_text_similarity_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_text_similarity_filter.py) | [tests](../tests/ops/filter/test_image_text_similarity_filter.py) | -| image_watermark_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含有水印概率在指定阈值之下的图像的样本 | [code](../data_juicer/ops/filter/image_watermark_filter.py) | [tests](../tests/ops/filter/test_image_watermark_filter.py) | -| language_id_score_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留特定语言的样本,通过预测的置信度得分来判断 | [code](../data_juicer/ops/filter/language_id_score_filter.py) | [tests](../tests/ops/filter/test_language_id_score_filter.py) | -| maximum_line_length_filter | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留最大行长度在指定范围内的样本 | [code](../data_juicer/ops/filter/maximum_line_length_filter.py) | [tests](../tests/ops/filter/test_maximum_line_length_filter.py) | -| perplexity_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留困惑度低于指定阈值的样本 | [code](../data_juicer/ops/filter/perplexity_filter.py) | [tests](../tests/ops/filter/test_perplexity_filter.py) | -| phrase_grounding_recall_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本 | [code](../data_juicer/ops/filter/phrase_grounding_recall_filter.py) | [tests](../tests/ops/filter/test_phrase_grounding_recall_filter.py) | -| special_characters_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留 special-char 比率的在指定范围内的样本 | [code](../data_juicer/ops/filter/special_characters_filter.py) | [tests](../tests/ops/filter/test_special_characters_filter.py) | -| specified_field_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 根据字段过滤样本,要求字段的值处于指定目标中 | [code](../data_juicer/ops/filter/specified_field_filter.py) | [tests](../tests/ops/filter/test_specified_field_filter.py) | -| specified_numeric_field_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 
根据字段过滤样本,要求字段的值处于指定范围(针对数字类型) | [code](../data_juicer/ops/filter/specified_numeric_field_filter.py) | [tests](../tests/ops/filter/test_specified_numeric_field_filter.py) | -| stopwords_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留停用词比率高于指定阈值的样本 | [code](../data_juicer/ops/filter/stopwords_filter.py) | [tests](../tests/ops/filter/test_stopwords_filter.py) | -| suffix_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留包含特定后缀的样本 | [code](../data_juicer/ops/filter/suffix_filter.py) | [tests](../tests/ops/filter/test_suffix_filter.py) | -| text_action_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留文本部分包含动作的样本 | [code](../data_juicer/ops/filter/text_action_filter.py) | [tests](../tests/ops/filter/test_text_action_filter.py) | -| text_entity_dependency_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留文本部分的依存树中具有非独立实体的样本 | [code](../data_juicer/ops/filter/text_entity_dependency_filter.py) | [tests](../tests/ops/filter/test_text_entity_dependency_filter.py) | -| text_length_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留总文本长度在指定范围内的样本 | [code](../data_juicer/ops/filter/text_length_filter.py) | [tests](../tests/ops/filter/test_text_length_filter.py) | -| token_num_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留token数在指定范围内的样本 | [code](../data_juicer/ops/filter/token_num_filter.py) | [tests](../tests/ops/filter/test_token_num_filter.py) | -| video_aspect_ratio_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含视频的宽高比在指定范围内的样本 | [code](../data_juicer/ops/filter/video_aesthetics_filter.py) | [tests](../tests/ops/filter/test_video_aesthetics_filter.py) | -| video_duration_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | 保留包含视频的时长在指定范围内的样本 | [code](../data_juicer/ops/filter/video_aspect_ratio_filter.py) | [tests](../tests/ops/filter/test_video_aspect_ratio_filter.py) | -| video_aesthetics_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | 保留指定帧的美学分数在指定范围内的样本 | [code](../data_juicer/ops/filter/video_duration_filter.py) | [tests](../tests/ops/filter/test_video_duration_filter.py) | +| 算子 | 标签 | 描述 | 源码 | 单测样例 | 
+|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------| +| alphanumeric_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留字母数字比例在指定范围内的样本 | [code](../data_juicer/ops/filter/alphanumeric_filter.py) | [tests](../tests/ops/filter/test_alphanumeric_filter.py) | +| audio_duration_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频的时长在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_duration_filter.py) | [tests](../tests/ops/filter/test_audio_duration_filter.py) | +| audio_nmf_snr_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频信噪比SNR(基于非负矩阵分解方法NMF计算)在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_nmf_snr_filter.py) | [tests](../tests/ops/filter/test_audio_nmf_snr_filter.py) | +| audio_size_filter | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic) | 保留包含音频的大小(bytes)在指定范围内的样本 | [code](../data_juicer/ops/filter/audio_size_filter.py) | [tests](../tests/ops/filter/test_audio_size_filter.py) | +| average_line_length_filter | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留平均行长度在指定范围内的样本 | [code](../data_juicer/ops/filter/average_line_length_filter.py) | [tests](../tests/ops/filter/test_average_line_length_filter.py) | +| character_repetition_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留 char-level n-gram 重复比率在指定范围内的样本 | [code](../data_juicer/ops/filter/character_repetition_filter.py) | [tests](../tests/ops/filter/test_character_repetition_filter.py) | +| flagged_words_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留使标记字比率保持在指定阈值以下的样本 | [code](../data_juicer/ops/filter/flagged_words_filter.py) | [tests](../tests/ops/filter/test_flagged_words_filter.py) | +| image_aesthetics_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含美学分数在指定范围内的图像的样本 | [code](../data_juicer/ops/filter/image_aesthetics_filter.py) | [tests](../tests/ops/filter/test_image_aesthetics_filter.py) | +| image_aspect_ratio_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的宽高比在指定范围内的样本 | [code](../data_juicer/ops/filter/image_aspect_ratio_filter.py) | 
[tests](../tests/ops/filter/test_image_aspect_ratio_filter.py) | +| image_face_count_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片中检测到的人脸数目在指定范围内的样本 | [code](../data_juicer/ops/filter/image_face_count_filter.py) | [tests](../tests/ops/filter/test_image_face_count_filter.py) | +| image_face_ratio_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的最大脸部区域在指定范围内的样本 | [code](../data_juicer/ops/filter/image_face_ratio_filter.py) | [tests](../tests/ops/filter/test_image_face_ratio_filter.py) | +| image_nsfw_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含NSFW分数在指定阈值之下的图像的样本 | [code](../data_juicer/ops/filter/image_nsfw_filter.py) | [tests](../tests/ops/filter/test_image_nsfw_filter.py) | +| image_pair_similarity_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_pair_similarity_filter.py) | [tests](../tests/ops/filter/test_image_pair_similarity_filter.py) | +| image_shape_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的形状(即宽和高)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_shape_filter.py) | [tests](../tests/ops/filter/test_image_shape_filter.py) | +| image_size_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) | 保留样本中包含的图片的大小(bytes)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_size_filter.py) | [tests](../tests/ops/filter/test_image_size_filter.py) | +| image_text_matching_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_text_matching_filter.py) | [tests](../tests/ops/filter/test_image_text_matching_filter.py) | +| image_text_similarity_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/image_text_similarity_filter.py) | [tests](../tests/ops/filter/test_image_text_similarity_filter.py) | +| image_watermark_filter | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留包含有水印概率在指定阈值之下的图像的样本 | [code](../data_juicer/ops/filter/image_watermark_filter.py) | [tests](../tests/ops/filter/test_image_watermark_filter.py) | +| language_id_score_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留特定语言的样本,通过预测的置信度得分来判断 | [code](../data_juicer/ops/filter/language_id_score_filter.py) | [tests](../tests/ops/filter/test_language_id_score_filter.py) | +| maximum_line_length_filter | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留最大行长度在指定范围内的样本 | [code](../data_juicer/ops/filter/maximum_line_length_filter.py) | [tests](../tests/ops/filter/test_maximum_line_length_filter.py) | +| 
perplexity_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留困惑度低于指定阈值的样本 | [code](../data_juicer/ops/filter/perplexity_filter.py) | [tests](../tests/ops/filter/test_perplexity_filter.py) | +| phrase_grounding_recall_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本 | [code](../data_juicer/ops/filter/phrase_grounding_recall_filter.py) | [tests](../tests/ops/filter/test_phrase_grounding_recall_filter.py) | +| special_characters_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留 special-char 比率的在指定范围内的样本 | [code](../data_juicer/ops/filter/special_characters_filter.py) | [tests](../tests/ops/filter/test_special_characters_filter.py) | +| specified_field_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 根据字段过滤样本,要求字段的值处于指定目标中 | [code](../data_juicer/ops/filter/specified_field_filter.py) | [tests](../tests/ops/filter/test_specified_field_filter.py) | +| specified_numeric_field_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 根据字段过滤样本,要求字段的值处于指定范围(针对数字类型) | [code](../data_juicer/ops/filter/specified_numeric_field_filter.py) | [tests](../tests/ops/filter/test_specified_numeric_field_filter.py) | +| stopwords_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留停用词比率高于指定阈值的样本 | [code](../data_juicer/ops/filter/stopwords_filter.py) | [tests](../tests/ops/filter/test_stopwords_filter.py) | +| suffix_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留包含特定后缀的样本 | [code](../data_juicer/ops/filter/suffix_filter.py) | [tests](../tests/ops/filter/test_suffix_filter.py) | +| text_action_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 保留文本部分包含动作的样本 | [code](../data_juicer/ops/filter/text_action_filter.py) | [tests](../tests/ops/filter/test_text_action_filter.py) | +| text_entity_dependency_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | 
Keeps samples whose dependency trees of the texts contain non-independent entities | [code](../data_juicer/ops/filter/text_entity_dependency_filter.py) | [tests](../tests/ops/filter/test_text_entity_dependency_filter.py) | +| text_length_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) | Keeps samples whose total text lengths are within the specified range | [code](../data_juicer/ops/filter/text_length_filter.py) | [tests](../tests/ops/filter/test_text_length_filter.py) | +| token_num_filter | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | Keeps samples whose token counts are within the specified range | [code](../data_juicer/ops/filter/token_num_filter.py) | [tests](../tests/ops/filter/test_token_num_filter.py) | +| video_aesthetics_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | Keeps samples whose aesthetics scores of the specified frames are within the specified range | [code](../data_juicer/ops/filter/video_aesthetics_filter.py) | [tests](../tests/ops/filter/test_video_aesthetics_filter.py) | +| video_aspect_ratio_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | Keeps samples containing videos whose aspect ratios are within the specified range | [code](../data_juicer/ops/filter/video_aspect_ratio_filter.py) | [tests](../tests/ops/filter/test_video_aspect_ratio_filter.py) | +| video_duration_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | Keeps samples containing videos whose durations are within the specified range | [code](../data_juicer/ops/filter/video_duration_filter.py) | [tests](../tests/ops/filter/test_video_duration_filter.py) | | video_frames_text_similarity_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | Keeps samples whose image-text feature cosine similarities (computed by a CLIP model) between the specified video frames and the text are within the specified range | [code](../data_juicer/ops/filter/video_frames_text_similarity_filter.py) | [tests](../tests/ops/filter/test_video_frames_text_similarity_filter.py) | | video_motion_score_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | Keeps samples containing videos whose motion scores (based on dense optical flow) are within the specified range | [code](../data_juicer/ops/filter/video_motion_score_filter.py) | [tests](../tests/ops/filter/test_video_motion_score_filter.py) | | video_motion_score_raft_filter | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) | Keeps samples containing videos whose motion scores (based on dense optical flow estimated by the RAFT model) are within the specified range | [code](../data_juicer/ops/filter/video_motion_score_raft_filter.py) | [tests](../tests/ops/filter/test_video_motion_score_raft_filter.py) | diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt index 9793d5746..0ecd058c4 100644 --- a/environments/dev_requires.txt +++ b/environments/dev_requires.txt @@ -4,3 +4,4 @@ sphinx sphinx-autobuild sphinx_rtd_theme recommonmark +wandb diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt index 7d37959fe..414458edc 100644 --- a/environments/minimal_requires.txt +++ b/environments/minimal_requires.txt @@ -2,9 +2,13 @@ datasets>=2.19.0 fsspec==2023.5.0 pandas numpy -av +av==13.1.0 soundfile +# need to install two dependencies required by librosa to avoid lazy_loader error librosa>=0.10 +samplerate +resampy +# need to install two dependencies required by librosa to avoid lazy_loader error loguru tabulate tqdm diff --git 
a/environments/science_requires.txt b/environments/science_requires.txt index f1e613126..10ea3b86e 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -11,7 +11,7 @@ selectolax nlpaug nlpcda nltk<3.9 -transformers>=4.37 +transformers>=4.47.0 transformers_stream_generator einops accelerate diff --git a/tests/benchmark_performance/configs/audio.yaml b/tests/benchmark_performance/configs/audio.yaml new file mode 100644 index 000000000..848c537b0 --- /dev/null +++ b/tests/benchmark_performance/configs/audio.yaml @@ -0,0 +1,14 @@ +# The config file for performance benchmark to measure the processing speed for +# the current Data-Juicer system. OPs are selected according to their tags and +# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) + +project_name: 'performance-benchmark-audio' +dataset_path: 'perf_bench_data/audio/audio-10k.jsonl' +export_path: 'outputs/performance_benchmark_audio/res.jsonl' +np: 16 +use_cache: false + +process: + - audio_duration_filter: + - audio_nmf_snr_filter: + - audio_size_filter: diff --git a/tests/benchmark_performance/configs/image.yaml b/tests/benchmark_performance/configs/image.yaml new file mode 100644 index 000000000..3ce03be53 --- /dev/null +++ b/tests/benchmark_performance/configs/image.yaml @@ -0,0 +1,23 @@ +# The config file for performance benchmark to measure the processing speed for +# the current Data-Juicer system. OPs are selected according to their tags and +# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) + +project_name: 'performance-benchmark-image' +dataset_path: 'perf_bench_data/image/10k.jsonl' +export_path: 'outputs/performance_benchmark_image/res.jsonl' +np: 16 +use_cache: false + +process: + - image_aesthetics_filter: + hf_scorer_model: 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' + min_score: 0.0 + mem_required: '1500MB' + - image_captioning_mapper: + hf_img2seq: 'Salesforce/blip2-opt-2.7b' + caption_num: 1 + keep_original_sample: false + mem_required: '16GB' + - image_shape_filter: + - image_blur_mapper: + - image_deduplicator: diff --git a/tests/benchmark_performance/configs/text.yaml b/tests/benchmark_performance/configs/text.yaml new file mode 100644 index 000000000..8b39bbeb8 --- /dev/null +++ b/tests/benchmark_performance/configs/text.yaml @@ -0,0 +1,21 @@ +# The config file for performance benchmark to measure the processing speed for +# the current Data-Juicer system. OPs are selected according to their tags and +# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) + +project_name: 'performance-benchmark-text' +dataset_path: 'perf_bench_data/text/wiki-10k.jsonl' +export_path: 'outputs/performance_benchmark_text/res.jsonl' +np: 16 +use_cache: false + +process: + - whitespace_normalization_mapper: + - token_num_filter: + hf_tokenizer: 'EleutherAI/pythia-6.9b-deduped' + min_num: 0 + - document_deduplicator: + lowercase: false + ignore_non_character: false + - topk_specified_field_selector: + field_key: '__dj__stats__.num_token' + topk: 1000 diff --git a/tests/benchmark_performance/configs/video.yaml b/tests/benchmark_performance/configs/video.yaml new file mode 100644 index 000000000..28fb3b98a --- /dev/null +++ b/tests/benchmark_performance/configs/video.yaml @@ -0,0 +1,21 @@ +# The config file for performance benchmark to measure the processing speed for +# the current Data-Juicer system. 
OPs are selected according to their tags and +# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md) + +project_name: 'performance-benchmark-video' +dataset_path: 'perf_bench_data/video/msr_vtt_train.jsonl' +export_path: 'outputs/performance_benchmark_video/res.jsonl' +np: 16 +use_cache: false + +process: + - video_nsfw_filter: + hf_nsfw_model: 'Falconsai/nsfw_image_detection' + score_threshold: 1.0 + mem_required: '1GB' + - video_tagging_from_frames_mapper: + mem_required: '9GB' + - video_duration_filter: + - video_split_by_key_frame_mapper: + keep_original_sample: false + - video_deduplicator: diff --git a/tests/benchmark_performance/report.py b/tests/benchmark_performance/report.py new file mode 100644 index 000000000..e53afa63a --- /dev/null +++ b/tests/benchmark_performance/report.py @@ -0,0 +1,126 @@ +import wandb +import fire +import os +import json +import yaml +import regex as re +from loguru import logger + +PROJECT = 'Data-Juicer Reports' +RUN_NAME = 'Performance Benchmark -- %s' +MODALITIES = {'text', 'image', 'video', 'audio'} +DIFF_TH = 0.1 + +def get_run_id(project, run_name, entity='dail'): + api = wandb.Api() + runs = api.runs(path=f'{entity}/{project}') + for run in runs: + if run.name == run_name: + return run.id + return '' + +def init_run(modality, config=None): + # get the run object for the specified modality + # if it doesn't exist, create one + # if it exists, get its run id and resume from it + run_id = get_run_id(PROJECT, RUN_NAME % modality) + if run_id == '': + # no existing run, create one + run = wandb.init(project=PROJECT, + config=config, + tags=['performance benchmark', modality], + name=RUN_NAME % modality) + run_id = get_run_id(PROJECT, RUN_NAME % modality) + else: + run = wandb.init(project=PROJECT, + id=run_id, + resume='must') + return run, run_id + +def main(): + wandb.login() + for modality in MODALITIES: + logger.info(f'--------------- {modality} ---------------') + work_dir = f'outputs/performance_benchmark_{modality}/' + + # read config + with open(os.path.join(work_dir, f'{modality}.yaml')) as fin: + config = yaml.load(fin, yaml.FullLoader) + + # init the wandb run + run, run_id = init_run(modality, config) + + # collect results from logs + log_pt = r'export_(.*?)_time_(\d*?).txt' + log_dir = os.path.join(work_dir, 'log') + log_files = os.listdir(log_dir) + log_file = None + for fn in log_files: + if re.match(log_pt, fn): + log_file = fn + break + if log_file is None: + logger.warning('No log files found.') + exit() + log_file = os.path.join(log_dir, log_file) + with open(log_file) as fin: + log_content = fin.read() + op_pt = r'OP \[(.*?)\] Done in (.*?)s' + total_pt = r'All OPs are done in (.*?)s' + op_data = re.findall(op_pt, log_content) + ops = [it[0] for it in op_data] + total_data = re.findall(total_pt, log_content) + + res = dict(op_data) + res['total_time'] = total_data[0] + res = {key: {'time': float(res[key])} for key in res} + + # collect resource utilization from monitor logs + monitor_file = os.path.join(work_dir, 'monitor', 'monitor.json') + with open(monitor_file) as fin: + monitor_res = json.load(fin) + assert len(monitor_res) == len(ops) + for op, resource_util_dict in zip(ops, monitor_res): + res[op].update(resource_util_dict['resource_analysis']) + + # upload results and finish the run + upload_res = { + modality: res + } + run.log(upload_res) + run.finish() + + # compare with the last run + api = wandb.Api() + api_run = api.run(f'{PROJECT}/{run_id}') + run_history = api_run.history() + if 
len(run_history) < 2: + continue + last_record = run_history.iloc[-2] + + for op_name, time in op_data: + last_time = last_record[f'{modality}.{op_name}.time'] + this_time = res[op_name]['time'] + dif = (this_time - last_time) / last_time + if dif > DIFF_TH: + logger.warning(f'Time cost for OP {[op_name]} increased by ' + f'{dif * 100}% (> {DIFF_TH * 100}%). Before-{last_time} vs. ' + f'Now-{this_time}') + else: + logger.info(f'Time cost for OP {[op_name]} changed by ' + f'{dif * 100}%. Before-{last_time} vs. ' + f'Now-{this_time}') + last_total = last_record[f'{modality}.total_time.time'] + this_total = res['total_time']['time'] + dif_total = (this_total - last_total) / last_total + if dif_total > DIFF_TH: + logger.warning(f'Total time cost increased by {dif_total * 100}% ' + f'(> {DIFF_TH * 100}%). Before-{last_total} vs. ' + f'Now-{this_total}') + else: + logger.info(f'Total time cost changed by {dif_total * 100}%. ' + f'Before-{last_total} vs. Now-{this_total}') + + +if __name__ == '__main__': + fire.Fire(main) diff --git a/tests/benchmark_performance/run.sh b/tests/benchmark_performance/run.sh new file mode 100644 index 000000000..1ec839d57 --- /dev/null +++ b/tests/benchmark_performance/run.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +# setup wandb configs +export WANDB_BASE_URL=$1 +export WANDB_API_KEY=$2 + +BENCH_PATH=$(cd "$(dirname "$0")"; pwd) +RELATIVE_DJ_PATH=../.. +MODALITIES=("text" "image" "video" "audio") + +cd $BENCH_PATH + +# 1. prepare dataset +wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz + +# 2. run the benchmark +for modality in ${MODALITIES[@]} +do + python $RELATIVE_DJ_PATH/tools/process_data.py --config configs/$modality.yaml +done + +# 3. collect & upload benchmark results +python report.py + +# 4. 
clear resources +rm -rf perf_bench_data.tar.gz +rm -rf perf_bench_data/ diff --git a/tests/ops/data/img1_dup.png b/tests/ops/data/img1_dup.png new file mode 120000 index 000000000..60579791d --- /dev/null +++ b/tests/ops/data/img1_dup.png @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/img1.png \ No newline at end of file diff --git a/tests/ops/data/img2_dup.jpg b/tests/ops/data/img2_dup.jpg new file mode 120000 index 000000000..4ff22937c --- /dev/null +++ b/tests/ops/data/img2_dup.jpg @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/img2.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup.jpg b/tests/ops/data/img3_dup.jpg new file mode 120000 index 000000000..aad69094d --- /dev/null +++ b/tests/ops/data/img3_dup.jpg @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/img3.jpg \ No newline at end of file diff --git a/tests/ops/data/img3_dup_dup.jpg b/tests/ops/data/img3_dup_dup.jpg new file mode 120000 index 000000000..525e66660 --- /dev/null +++ b/tests/ops/data/img3_dup_dup.jpg @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/img3_dup.jpg \ No newline at end of file diff --git a/tests/ops/data/video1_dup.mp4 b/tests/ops/data/video1_dup.mp4 new file mode 120000 index 000000000..0235123a8 --- /dev/null +++ b/tests/ops/data/video1_dup.mp4 @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/video1.mp4 \ No newline at end of file diff --git a/tests/ops/data/video2_dup.mp4 b/tests/ops/data/video2_dup.mp4 new file mode 120000 index 000000000..bc5b6ea2a --- /dev/null +++ b/tests/ops/data/video2_dup.mp4 @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/video2.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup.mp4 b/tests/ops/data/video3_dup.mp4 new file mode 120000 index 000000000..41fa9151f --- /dev/null +++ b/tests/ops/data/video3_dup.mp4 @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/video3.mp4 \ No newline at end of file diff --git a/tests/ops/data/video3_dup_dup.mp4 b/tests/ops/data/video3_dup_dup.mp4 new file mode 120000 index 000000000..b6a36a44c --- /dev/null +++ b/tests/ops/data/video3_dup_dup.mp4 @@ -0,0 +1 @@ +/apsarapangu/disk2/jiangnana.jnn/workspace/data-juicer-fork/data-juicer/tests/ops/deduplicator/../data/video3_dup.mp4 \ No newline at end of file diff --git a/tests/ops/filter/test_audio_duration_filter.py b/tests/ops/filter/test_audio_duration_filter.py index 5b367f0ec..64a5c05c8 100644 --- a/tests/ops/filter/test_audio_duration_filter.py +++ b/tests/ops/filter/test_audio_duration_filter.py @@ -5,11 +5,8 @@ from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter from data_juicer.utils.constant import Fields -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG, SKIPPED_TESTS +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG -# skip due to conflicts when run lazy_load in multiprocessing in librosa -# tests passed locally. 
-@SKIPPED_TESTS.register_module() class AudioDurationFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_audio_nmf_snr_filter.py b/tests/ops/filter/test_audio_nmf_snr_filter.py index 384435828..d0dec38b8 100644 --- a/tests/ops/filter/test_audio_nmf_snr_filter.py +++ b/tests/ops/filter/test_audio_nmf_snr_filter.py @@ -5,11 +5,8 @@ from data_juicer.ops.filter.audio_nmf_snr_filter import AudioNMFSNRFilter from data_juicer.utils.constant import Fields -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -# skip due to conflicts when run lazy_load in multiprocessing in librosa -# tests passed locally. -@SKIPPED_TESTS.register_module() class AudioNMFSNRFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/mapper/test_python_file_mapper.py b/tests/ops/mapper/test_python_file_mapper.py new file mode 100644 index 000000000..97d280481 --- /dev/null +++ b/tests/ops/mapper/test_python_file_mapper.py @@ -0,0 +1,108 @@ +import unittest +import tempfile + +from data_juicer.ops.mapper.python_file_mapper import PythonFileMapper +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + +class TestPythonFileMapper(DataJuicerTestCaseBase): + + def test_function_execution(self): + """Test the correct execution of a loadable function.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "def process_data(sample):\n" + " return {'result': sample['value'] + 10}\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + mapper = PythonFileMapper(temp_file.name, "process_data") + result = mapper.process_single({'value': 5}) + self.assertEqual(result, {'result': 15}) + + def test_function_batched(self): + """Test for a function that processes a batch.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "def process_data(samples):\n" + " return {'result': samples['value'] + [10]}\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + mapper = PythonFileMapper(temp_file.name, "process_data", batched=True) + result = mapper.process_batched({'value': [5]}) + self.assertEqual(result, {'result': [5, 10]}) + + def test_function_with_import(self): + """Test for a function that contains an import statement.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "import numpy as np\n" + "def process_data(sample):\n" + " return {'result': np.sum([sample['value'], 10])}\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + mapper = PythonFileMapper(temp_file.name, "process_data") + result = mapper.process_single({'value': 5}) + self.assertEqual(result, {'result': 15}) + + def test_file_not_found(self): + """Test for a non-existent file.""" + with self.assertRaises(FileNotFoundError) as cm: + PythonFileMapper("non_existent.py", "process_data") + self.assertIn("does not exist", str(cm.exception)) + + def test_file_not_python_extension(self): + """Test for a file that exists but is not a .py file.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.txt', mode='w+') as temp_file: + temp_file.write("This is a text file.") + temp_file.seek(0) # Rewind the file so it can be read + with self.assertRaises(ValueError) as cm: + 
PythonFileMapper(temp_file.name, "some_function") + self.assertIn("is not a Python file", str(cm.exception)) + + def test_function_not_found(self): + """Test for function not existing in the provided file.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "def existing_function(sample):\n" + " return sample\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + with self.assertRaises(ValueError) as cm: + PythonFileMapper(temp_file.name, "non_existing_function") + self.assertIn("not found", str(cm.exception)) + + def test_function_not_callable(self): + """Test for trying to load a non-callable object.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write("x = 42") + temp_file.seek(0) # Rewind the file so it can be read + with self.assertRaises(ValueError) as cm: + PythonFileMapper(temp_file.name, "x") + self.assertIn("not callable", str(cm.exception)) + + def test_function_multiple_arguments(self): + """Test for a function that requires more than one argument.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "def multi_arg_function(arg1, arg2):\n" + " return arg1 + arg2\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + with self.assertRaises(ValueError) as cm: + PythonFileMapper(temp_file.name, "multi_arg_function") + self.assertIn("must take exactly one argument", str(cm.exception)) + + def test_invalid_return_type(self): + """Test for a function returning a non-dictionary.""" + with tempfile.NamedTemporaryFile(delete=True, suffix='.py', mode='w+') as temp_file: + temp_file.write( + "def invalid_function(sample):\n" + " return sample['value'] + 5\n" + ) + temp_file.seek(0) # Rewind the file so it can be read + mapper = PythonFileMapper(temp_file.name, "invalid_function") + with self.assertRaises(ValueError) as cm: + mapper.process_single({'value': 5}) + self.assertIn("Function must return a dictionary, got int instead.", str(cm.exception)) + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_python_lambda_mapper.py b/tests/ops/mapper/test_python_lambda_mapper.py new file mode 100644 index 000000000..97fac4794 --- /dev/null +++ b/tests/ops/mapper/test_python_lambda_mapper.py @@ -0,0 +1,68 @@ +import unittest + +from data_juicer.ops.mapper.python_lambda_mapper import PythonLambdaMapper +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase + +class PythonLambdaMapperTest(DataJuicerTestCaseBase): + + def test_lambda_function_batched(self): + mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + [6]}", batched=True) # Append '6' to value + result = mapper.process_batched({'value': [5]}) + self.assertEqual(result, {'value': [5, 6]}) + + def test_lambda_modifies_values(self): + mapper = PythonLambdaMapper("lambda d: {'value': d['value'] + 1}") # '+1' to 'value' + result = mapper.process_single({'value': 5}) + self.assertEqual(result, {'value': 6}) + + def test_lambda_combines_values(self): + mapper = PythonLambdaMapper("lambda d: {'combined': d['a'] + d['b']}") + result = mapper.process_single({'a': 3, 'b': 7}) + self.assertEqual(result, {'combined': 10}) + + def test_lambda_swaps_values(self): + mapper = PythonLambdaMapper("lambda d: {'a': d['b'], 'b': d['a']}") + result = mapper.process_single({'a': 1, 'b': 2}) + self.assertEqual(result, {'a': 2, 'b': 1}) + + def test_lambda_result_is_not_dict(self): + mapper = 
PythonLambdaMapper("lambda d: d['value'] + 1") # This returns an int + with self.assertRaises(ValueError) as cm: + mapper.process_single({'value': 10}) + self.assertIn("Lambda function must return a dictionary, got int instead.", str(cm.exception)) + + def test_invalid_syntax(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("invalid lambda") # Invalid syntax + self.assertIn("Invalid lambda function", str(cm.exception)) + + def test_invalid_expression(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("3 + 5") # Not a lambda + self.assertIn("Input string must be a valid lambda function.", str(cm.exception)) + + def test_lambda_with_multiple_arguments(self): + with self.assertRaises(ValueError) as cm: + PythonLambdaMapper("lambda x, y: {'sum': x + y}") # Creating a lambda accepts two arguments + self.assertIn("Lambda function must have exactly one argument.", str(cm.exception)) + + def test_lambda_returning_unexpected_structure(self): + mapper = PythonLambdaMapper("lambda d: ({'value': d['value']}, {'extra': d['extra']})") # Invalid return type; too many dictionaries + with self.assertRaises(ValueError) as cm: + mapper.process_single({'value': 5, 'extra': 10}) + self.assertIn("Lambda function must return a dictionary, got tuple instead.", str(cm.exception)) + + def test_lambda_modifies_in_place_and_returns(self): + mapper = PythonLambdaMapper("lambda d: d.update({'new_key': 'added_value'}) or d") # Returns the modified dictionary + sample_dict = {'value': 3} + result = mapper.process_single(sample_dict) + self.assertEqual(result, {'value': 3, 'new_key': 'added_value'}) # Ensure the update worked + + def test_lambda_function_with_no_operation(self): + mapper = PythonLambdaMapper("lambda d: d") # Simply returns the input dictionary + sample_dict = {'key': 'value'} + result = mapper.process_single(sample_dict) + self.assertEqual(result, {'key': 'value'}) # Unchanged + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/ops/mapper/test_video_tagging_from_audio_mapper.py b/tests/ops/mapper/test_video_tagging_from_audio_mapper.py index 5cace0b7a..8bbf05933 100644 --- a/tests/ops/mapper/test_video_tagging_from_audio_mapper.py +++ b/tests/ops/mapper/test_video_tagging_from_audio_mapper.py @@ -6,11 +6,8 @@ VideoTaggingFromAudioMapper from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import SpecialTokens -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase -# skip due to conflicts when run lazy_load in multiprocessing in librosa -# tests passed locally. -@SKIPPED_TESTS.register_module() class VideoTaggingFromAudioMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data')