From 77b498a6c8511fb518a49f71c783aad5f530cf6c Mon Sep 17 00:00:00 2001
From: Yilun Huang <lielin.hyl@alibaba-inc.com>
Date: Fri, 6 Dec 2024 14:59:52 +0800
Subject: [PATCH] Performance benchmark (#483)

* * prepare the config files for performance benchmark
* add the result report code
* fix errors in docs

* + add performance benchmark script and workflow

* * install and config for wandb

* + set mp start method to 'spawn' for windows in Monitor

* + add a switch arg to control whether to open monitor

* + add analyzed results to the monitor results as well

* * optimize the res structure
* optimize the run initialization

* * fix bugs in running scripts
* add ram downloading link
* specify the version of pyav to 13.1.0

* * modify the diff comparison according to the new log format

* * update transformers to the latest version to avoid "shape mismatch" error from v4.46.3 (from issue #34990 from transformers)
---
 .github/workflows/perf-bench.yml              |  56 ++++++++
 configs/config_all.yaml                       |   1 +
 data_juicer/config/config.py                  |   6 +
 data_juicer/core/data.py                      |  35 +++--
 data_juicer/core/executor.py                  |  13 +-
 data_juicer/core/monitor.py                   |   5 +-
 data_juicer/utils/model_utils.py              |   5 +
 docs/Operators_ZH.md                          |  72 +++++-----
 environments/dev_requires.txt                 |   1 +
 environments/science_requires.txt             |   2 +-
 .../benchmark_performance/configs/audio.yaml  |  14 ++
 .../benchmark_performance/configs/image.yaml  |  23 ++++
 tests/benchmark_performance/configs/text.yaml |  21 +++
 .../benchmark_performance/configs/video.yaml  |  20 +++
 tests/benchmark_performance/report.py         | 126 ++++++++++++++++++
 tests/benchmark_performance/run.sh            |  27 ++++
 16 files changed, 372 insertions(+), 55 deletions(-)
 create mode 100644 .github/workflows/perf-bench.yml
 create mode 100644 tests/benchmark_performance/configs/audio.yaml
 create mode 100644 tests/benchmark_performance/configs/image.yaml
 create mode 100644 tests/benchmark_performance/configs/text.yaml
 create mode 100644 tests/benchmark_performance/configs/video.yaml
 create mode 100644 tests/benchmark_performance/report.py
 create mode 100644 tests/benchmark_performance/run.sh

diff --git a/.github/workflows/perf-bench.yml b/.github/workflows/perf-bench.yml
new file mode 100644
index 000000000..2a4d6658b
--- /dev/null
+++ b/.github/workflows/perf-bench.yml
@@ -0,0 +1,56 @@
+# This workflow runs the performance benchmark inside the docker compose environment on a self-hosted Linux runner.
+# It is triggered manually or by pushes to main, and passes the INTERNAL_WANDB_URL / INTERNAL_WANDB_API_KEY secrets to the benchmark script.
+
+name: performance_benchmark
+
+on:
+  workflow_dispatch:
+  push:
+    branches:
+      - main
+
+permissions:
+  contents: read
+
+env:
+  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
+
+jobs:
+  unittest-single:
+    runs-on: [self-hosted, linux]
+    environment: Testing
+    steps:
+    - uses: actions/checkout@v3
+      with:
+        path: dj-${{ github.run_id }}  # per-run checkout directory, removed again by the final step
+
+    - name: Setup docker compose
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose up -d
+
+    - name: Install data-juicer
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head pip install -e .\[all\]
+
+    - name: Clean dataset cache
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head rm -rf /data/huggingface/dataset
+
+    - name: Run performance benchmark standalone
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      run: |
+        docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
+
+    - name: Remove docker compose
+      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
+      if: always()  # tear the containers down even when an earlier step failed
+      run: |
+        docker compose down --remove-orphans
+
+    - name: Cleanup workspace
+      if: always()  # remove the per-run checkout even when an earlier step failed
+      run: |
+        rm -rf dj-${{ github.run_id }}
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 8f60e2e69..4019b66a5 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -15,6 +15,7 @@ text_keys: 'text'                                           # the key name of fi
 suffixes: []                                                # the suffix of files that will be read. For example: '.txt', 'txt' or ['txt', '.pdf', 'docx']
 use_cache: true                                             # whether to use the cache management of Hugging Face datasets. It might take up lots of disk space when using cache
 ds_cache_dir: null                                          # cache dir for Hugging Face datasets. In default, it\'s the same as the environment variable `HF_DATASETS_CACHE`, whose default value is usually "~/.cache/huggingface/datasets". If this argument is set to a valid path by users, it will override the default cache dir
+open_monitor: true                                          # Whether to open the monitor to trace resource utilization for each OP during data processing. It's true by default.
 use_checkpoint: false                                       # whether to use the checkpoint management to save the latest version of dataset to work dir when processing. Rerun the same config will reload the checkpoint and skip ops before it. Cache will be disabled when using checkpoint. If args of ops before the checkpoint are changed, all ops will be rerun from the beginning.
 temp_dir: null                                              # the path to the temp directory to store intermediate caches when cache is disabled, these cache files will be removed on-the-fly. In default, it's None, so the temp dir will be specified by system. NOTICE: you should be caution when setting this argument because it might cause unexpected program behaviors when this path is set to an unsafe directory.
 open_tracer: false                                          # whether to open the tracer to trace the changes during process. It might take more time when opening tracer
diff --git a/data_juicer/config/config.py b/data_juicer/config/config.py
index 76a20b786..71f871f10 100644
--- a/data_juicer/config/config.py
+++ b/data_juicer/config/config.py
@@ -230,6 +230,12 @@ def init_configs(args: Optional[List[str]] = None):
         help='The compression method of the cache file, which can be'
         'specified in ["gzip", "zstd", "lz4"]. If this parameter is'
         'None, the cache file will not be compressed.')
+    parser.add_argument(
+        '--open_monitor',
+        type=bool,
+        default=True,
+        help='Whether to open the monitor to trace resource utilization for '
+        'each OP during data processing. It\'s True in default.')
     parser.add_argument(
         '--use_checkpoint',
         type=bool,
diff --git a/data_juicer/core/data.py b/data_juicer/core/data.py
index d1a77b581..361f6e8a0 100644
--- a/data_juicer/core/data.py
+++ b/data_juicer/core/data.py
@@ -164,13 +164,16 @@ def __getitem__(self, key):
             res = super().__getitem__(key)
         return nested_obj_factory(res)
 
-    def process(self,
-                operators,
-                *,
-                work_dir=None,
-                exporter=None,
-                checkpointer=None,
-                tracer=None):
+    def process(
+        self,
+        operators,
+        *,
+        work_dir=None,
+        exporter=None,
+        checkpointer=None,
+        tracer=None,
+        open_monitor=True,
+    ):
         if operators is None:
             return self
 
@@ -179,7 +182,8 @@ def process(self,
         unforkable_operators = set(UNFORKABLE.modules.keys())
 
         # resource utilization monitor
-        resource_util_list = []
+        if open_monitor:
+            resource_util_list = []
 
         dataset = self
         try:
@@ -196,12 +200,16 @@ def process(self,
                     'exporter': exporter,
                     'tracer': tracer,
                 }
-                dataset, resource_util_per_op = Monitor.monitor_func(
-                    op.run, args=run_args)
+                if open_monitor:
+                    dataset, resource_util_per_op = Monitor.monitor_func(
+                        op.run, args=run_args)
+                else:
+                    dataset = op.run(**run_args)
                 # record processed ops
                 if checkpointer is not None:
                     checkpointer.record(op._op_cfg)
-                resource_util_list.append(resource_util_per_op)
+                if open_monitor:
+                    resource_util_list.append(resource_util_per_op)
                 end = time()
                 logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. '
                             f'Left {len(dataset)} samples.')
@@ -215,7 +223,10 @@ def process(self,
                             'last op...')
                 dataset.cleanup_cache_files()
                 checkpointer.save_ckpt(dataset)
-            if work_dir:
+            if work_dir and open_monitor:
+                # get the analyzed version
+                resource_util_list = Monitor.analyze_resource_util_list(
+                    resource_util_list)
                 monitor_dir = os.path.join(work_dir, 'monitor')
                 os.makedirs(monitor_dir, exist_ok=True)
                 with open(os.path.join(monitor_dir, 'monitor.json'),
diff --git a/data_juicer/core/executor.py b/data_juicer/core/executor.py
index d9445dad0..f78059247 100644
--- a/data_juicer/core/executor.py
+++ b/data_juicer/core/executor.py
@@ -193,11 +193,14 @@ def run(self,
         # - If checkpoint is open, clean the cache files after each process
         logger.info('Processing data...')
         tstart = time()
-        dataset = dataset.process(ops,
-                                  work_dir=self.work_dir,
-                                  exporter=self.exporter,
-                                  checkpointer=self.ckpt_manager,
-                                  tracer=self.tracer)
+        dataset = dataset.process(
+            ops,
+            work_dir=self.work_dir,
+            exporter=self.exporter,
+            checkpointer=self.ckpt_manager,
+            tracer=self.tracer,
+            open_monitor=self.cfg.open_monitor,
+        )
         tend = time()
         logger.info(f'All OPs are done in {tend - tstart:.3f}s.')
 
diff --git a/data_juicer/core/monitor.py b/data_juicer/core/monitor.py
index 67f8f62a5..0210e3732 100644
--- a/data_juicer/core/monitor.py
+++ b/data_juicer/core/monitor.py
@@ -205,7 +205,10 @@ def monitor_func(func, args=None, sample_interval=0.5):
         resource_util_dict = {}
 
         # start monitor
-        ctx = get_context('fork')
+        start_method = 'fork'
+        if os.name == 'nt':  # for Windows
+            start_method = 'spawn'
+        ctx = get_context(start_method)
         with ctx.Manager() as manager:
             mdict = manager.dict()
             mdict['stop'] = False
diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py
index eb521e619..305145e82 100644
--- a/data_juicer/utils/model_utils.py
+++ b/data_juicer/utils/model_utils.py
@@ -51,6 +51,11 @@
     'punkt.*.pickle':
     'https://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/'
     'data_juicer/models/',
+
+    # ram
+    'ram_plus_swin_large_14m.pth':
+    'http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/models/'
+    'ram_plus_swin_large_14m.pth',
 }
 
 
diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
index 6f9710165..01f0bdb0a 100644
--- a/docs/Operators_ZH.md
+++ b/docs/Operators_ZH.md
@@ -119,42 +119,42 @@ Data-Juicer 中的算子分为以下 5 种类型。
 
 ## Filter <a name="filter"/>
 
-| 算子                                | 标签                                                                                                                                                                                                                                                                                                                        | 描述                                                                        | 源码                                                                     | 单测样例                                                                 |
-|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------|
-| alphanumeric_filter                 | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留字母数字比例在指定范围内的样本                                          | [code](../data_juicer/ops/filter/alphanumeric_filter.py)                 | [tests](../tests/ops/filter/test_alphanumeric_filter.py)                 |
-| audio_duration_filter               | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频的时长在指定范围内的样本                                        | [code](../data_juicer/ops/filter/audio_duration_filter.py)               | [tests](../tests/ops/filter/test_audio_duration_filter.py)               |
-| audio_nmf_snr_filter                | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频信噪比SNR（基于非负矩阵分解方法NMF计算）在指定范围内的样本      | [code](../data_juicer/ops/filter/audio_nmf_snr_filter.py)                | [tests](../tests/ops/filter/test_audio_nmf_snr_filter.py)                |
-| audio_size_filter                   | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频的大小（bytes）在指定范围内的样本                               | [code](../data_juicer/ops/filter/audio_size_filter.py)                   | [tests](../tests/ops/filter/test_audio_size_filter.py)                   |
-| average_line_length_filter          | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 保留平均行长度在指定范围内的样本                                            | [code](../data_juicer/ops/filter/average_line_length_filter.py)          | [tests](../tests/ops/filter/test_average_line_length_filter.py)          |
-| character_repetition_filter         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留 char-level n-gram 重复比率在指定范围内的样本                           | [code](../data_juicer/ops/filter/character_repetition_filter.py)         | [tests](../tests/ops/filter/test_character_repetition_filter.py)         |
-| flagged_words_filter                | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留使标记字比率保持在指定阈值以下的样本                                    | [code](../data_juicer/ops/filter/flagged_words_filter.py)                | [tests](../tests/ops/filter/test_flagged_words_filter.py)                |
-| image_aesthetics_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含美学分数在指定范围内的图像的样本                                    | [code](../data_juicer/ops/filter/image_aesthetics_filter.py)             | [tests](../tests/ops/filter/test_image_aesthetics_filter.py)             |
-| image_aspect_ratio_filter           | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的宽高比在指定范围内的样本                              | [code](../data_juicer/ops/filter/image_aspect_ratio_filter.py)           | [tests](../tests/ops/filter/test_image_aspect_ratio_filter.py)           |
-| image_face_count_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片中检测到的人脸数目在指定范围内的样本                    | [code](../data_juicer/ops/filter/image_face_count_filter.py)             | [tests](../tests/ops/filter/test_image_face_count_filter.py)             |
-| image_face_ratio_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的最大脸部区域在指定范围内的样本                        | [code](../data_juicer/ops/filter/image_face_ratio_filter.py)             | [tests](../tests/ops/filter/test_image_face_ratio_filter.py)             |
-| image_nsfw_filter                   | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含NSFW分数在指定阈值之下的图像的样本                                  | [code](../data_juicer/ops/filter/image_nsfw_filter.py)                   | [tests](../tests/ops/filter/test_image_nsfw_filter.py)                   |
-| image_pair_similarity_filter        | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本                      | [code](../data_juicer/ops/filter/image_pair_similarity_filter.py)        | [tests](../tests/ops/filter/test_image_pair_similarity_filter.py)        |
-| image_shape_filter                  | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的形状（即宽和高）在指定范围内的样本                    | [code](../data_juicer/ops/filter/image_shape_filter.py)                  | [tests](../tests/ops/filter/test_image_shape_filter.py)                  |
-| image_size_filter                   | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的大小（bytes）在指定范围内的样本                       | [code](../data_juicer/ops/filter/image_size_filter.py)                   | [tests](../tests/ops/filter/test_image_size_filter.py)                   |
-| image_text_matching_filter          | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本                   | [code](../data_juicer/ops/filter/image_text_matching_filter.py)          | [tests](../tests/ops/filter/test_image_text_matching_filter.py)          |
-| image_text_similarity_filter        | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本               | [code](../data_juicer/ops/filter/image_text_similarity_filter.py)        | [tests](../tests/ops/filter/test_image_text_similarity_filter.py)        |
-| image_watermark_filter              | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含有水印概率在指定阈值之下的图像的样本                                | [code](../data_juicer/ops/filter/image_watermark_filter.py)              | [tests](../tests/ops/filter/test_image_watermark_filter.py)              |
-| language_id_score_filter            | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留特定语言的样本，通过预测的置信度得分来判断                              | [code](../data_juicer/ops/filter/language_id_score_filter.py)            | [tests](../tests/ops/filter/test_language_id_score_filter.py)            |
-| maximum_line_length_filter          | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 保留最大行长度在指定范围内的样本                                            | [code](../data_juicer/ops/filter/maximum_line_length_filter.py)          | [tests](../tests/ops/filter/test_maximum_line_length_filter.py)          |
-| perplexity_filter                   | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留困惑度低于指定阈值的样本                                                | [code](../data_juicer/ops/filter/perplexity_filter.py)                   | [tests](../tests/ops/filter/test_perplexity_filter.py)                   |
-| phrase_grounding_recall_filter      | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本            | [code](../data_juicer/ops/filter/phrase_grounding_recall_filter.py)      | [tests](../tests/ops/filter/test_phrase_grounding_recall_filter.py)      |
-| special_characters_filter           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留 special-char 比率的在指定范围内的样本                                  | [code](../data_juicer/ops/filter/special_characters_filter.py)           | [tests](../tests/ops/filter/test_special_characters_filter.py)           |
-| specified_field_filter              | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 根据字段过滤样本，要求字段的值处于指定目标中                                | [code](../data_juicer/ops/filter/specified_field_filter.py)              | [tests](../tests/ops/filter/test_specified_field_filter.py)              |
-| specified_numeric_field_filter      | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 根据字段过滤样本，要求字段的值处于指定范围（针对数字类型）                  | [code](../data_juicer/ops/filter/specified_numeric_field_filter.py)      | [tests](../tests/ops/filter/test_specified_numeric_field_filter.py)      |
-| stopwords_filter                    | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留停用词比率高于指定阈值的样本                                            | [code](../data_juicer/ops/filter/stopwords_filter.py)                    | [tests](../tests/ops/filter/test_stopwords_filter.py)                    |
-| suffix_filter                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                                                                               | 保留包含特定后缀的样本                                                      | [code](../data_juicer/ops/filter/suffix_filter.py)                       | [tests](../tests/ops/filter/test_suffix_filter.py)                       |
-| text_action_filter                  | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留文本部分包含动作的样本                                                  | [code](../data_juicer/ops/filter/text_action_filter.py)                  | [tests](../tests/ops/filter/test_text_action_filter.py)                  |
-| text_entity_dependency_filter       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留文本部分的依存树中具有非独立实体的样本                                  | [code](../data_juicer/ops/filter/text_entity_dependency_filter.py)       | [tests](../tests/ops/filter/test_text_entity_dependency_filter.py)       |
-| text_length_filter                  | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留总文本长度在指定范围内的样本                                            | [code](../data_juicer/ops/filter/text_length_filter.py)                  | [tests](../tests/ops/filter/test_text_length_filter.py)                  |
-| token_num_filter                    | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留token数在指定范围内的样本                                               | [code](../data_juicer/ops/filter/token_num_filter.py)                    | [tests](../tests/ops/filter/test_token_num_filter.py)                    |
-| video_aspect_ratio_filter           | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含视频的宽高比在指定范围内的样本                                      | [code](../data_juicer/ops/filter/video_aesthetics_filter.py)             | [tests](../tests/ops/filter/test_video_aesthetics_filter.py)             |
-| video_duration_filter               | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留包含视频的时长在指定范围内的样本                                        | [code](../data_juicer/ops/filter/video_aspect_ratio_filter.py)           | [tests](../tests/ops/filter/test_video_aspect_ratio_filter.py)           |
-| video_aesthetics_filter             | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留指定帧的美学分数在指定范围内的样本                                      | [code](../data_juicer/ops/filter/video_duration_filter.py)               | [tests](../tests/ops/filter/test_video_duration_filter.py)               |
+| 算子                                  | 标签                                                                                                                                                                                                                                                                                                                          | 描述                                        | 源码                                                                       | 单测样例                                                                     |
+|-------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------------------------------|--------------------------------------------------------------------------|--------------------------------------------------------------------------|
+| alphanumeric_filter                 | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留字母数字比例在指定范围内的样本                         | [code](../data_juicer/ops/filter/alphanumeric_filter.py)                 | [tests](../tests/ops/filter/test_alphanumeric_filter.py)                 |
+| audio_duration_filter               | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频的时长在指定范围内的样本                        | [code](../data_juicer/ops/filter/audio_duration_filter.py)               | [tests](../tests/ops/filter/test_audio_duration_filter.py)               |
+| audio_nmf_snr_filter                | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频信噪比SNR（基于非负矩阵分解方法NMF计算）在指定范围内的样本    | [code](../data_juicer/ops/filter/audio_nmf_snr_filter.py)                | [tests](../tests/ops/filter/test_audio_nmf_snr_filter.py)                |
+| audio_size_filter                   | ![Audio](https://img.shields.io/badge/Audio-0DA64F?style=plastic)                                                                                                                                                                                                                                                           | 保留包含音频的大小（bytes）在指定范围内的样本                 | [code](../data_juicer/ops/filter/audio_size_filter.py)                   | [tests](../tests/ops/filter/test_audio_size_filter.py)                   |
+| average_line_length_filter          | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 保留平均行长度在指定范围内的样本                          | [code](../data_juicer/ops/filter/average_line_length_filter.py)          | [tests](../tests/ops/filter/test_average_line_length_filter.py)          |
+| character_repetition_filter         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留 char-level n-gram 重复比率在指定范围内的样本        | [code](../data_juicer/ops/filter/character_repetition_filter.py)         | [tests](../tests/ops/filter/test_character_repetition_filter.py)         |
+| flagged_words_filter                | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留使标记字比率保持在指定阈值以下的样本                      | [code](../data_juicer/ops/filter/flagged_words_filter.py)                | [tests](../tests/ops/filter/test_flagged_words_filter.py)                |
+| image_aesthetics_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含美学分数在指定范围内的图像的样本                      | [code](../data_juicer/ops/filter/image_aesthetics_filter.py)             | [tests](../tests/ops/filter/test_image_aesthetics_filter.py)             |
+| image_aspect_ratio_filter           | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的宽高比在指定范围内的样本                   | [code](../data_juicer/ops/filter/image_aspect_ratio_filter.py)           | [tests](../tests/ops/filter/test_image_aspect_ratio_filter.py)           |
+| image_face_count_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片中检测到的人脸数目在指定范围内的样本              | [code](../data_juicer/ops/filter/image_face_count_filter.py)             | [tests](../tests/ops/filter/test_image_face_count_filter.py)             |
+| image_face_ratio_filter             | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的最大脸部区域在指定范围内的样本                | [code](../data_juicer/ops/filter/image_face_ratio_filter.py)             | [tests](../tests/ops/filter/test_image_face_ratio_filter.py)             |
+| image_nsfw_filter                   | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含NSFW分数在指定阈值之下的图像的样本                   | [code](../data_juicer/ops/filter/image_nsfw_filter.py)                   | [tests](../tests/ops/filter/test_image_nsfw_filter.py)                   |
+| image_pair_similarity_filter        | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留图像特征余弦相似度(基于CLIP模型)在指定范围内的样本            | [code](../data_juicer/ops/filter/image_pair_similarity_filter.py)        | [tests](../tests/ops/filter/test_image_pair_similarity_filter.py)        |
+| image_shape_filter                  | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的形状（即宽和高）在指定范围内的样本              | [code](../data_juicer/ops/filter/image_shape_filter.py)                  | [tests](../tests/ops/filter/test_image_shape_filter.py)                  |
+| image_size_filter                   | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic)                                                                                                                                                                                                                                                           | 保留样本中包含的图片的大小（bytes）在指定范围内的样本             | [code](../data_juicer/ops/filter/image_size_filter.py)                   | [tests](../tests/ops/filter/test_image_size_filter.py)                   |
+| image_text_matching_filter          | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留图像-文本的分类匹配分(基于BLIP模型)在指定范围内的样本          | [code](../data_juicer/ops/filter/image_text_matching_filter.py)          | [tests](../tests/ops/filter/test_image_text_matching_filter.py)          |
+| image_text_similarity_filter        | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本        | [code](../data_juicer/ops/filter/image_text_similarity_filter.py)        | [tests](../tests/ops/filter/test_image_text_similarity_filter.py)        |
+| image_watermark_filter              | ![Image](https://img.shields.io/badge/Image-07B0F2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留包含有水印概率在指定阈值之下的图像的样本                    | [code](../data_juicer/ops/filter/image_watermark_filter.py)              | [tests](../tests/ops/filter/test_image_watermark_filter.py)              |
+| language_id_score_filter            | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留特定语言的样本，通过预测的置信度得分来判断                   | [code](../data_juicer/ops/filter/language_id_score_filter.py)            | [tests](../tests/ops/filter/test_language_id_score_filter.py)            |
+| maximum_line_length_filter          | ![Code](https://img.shields.io/badge/Code-590F08?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 保留最大行长度在指定范围内的样本                          | [code](../data_juicer/ops/filter/maximum_line_length_filter.py)          | [tests](../tests/ops/filter/test_maximum_line_length_filter.py)          |
+| perplexity_filter                   | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留困惑度低于指定阈值的样本                            | [code](../data_juicer/ops/filter/perplexity_filter.py)                   | [tests](../tests/ops/filter/test_perplexity_filter.py)                   |
+| phrase_grounding_recall_filter      | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留从文本中提取的名词短语在图像中的定位召回率在一定范围内的样本          | [code](../data_juicer/ops/filter/phrase_grounding_recall_filter.py)      | [tests](../tests/ops/filter/test_phrase_grounding_recall_filter.py)      |
+| special_characters_filter           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留 special-char 比率的在指定范围内的样本              | [code](../data_juicer/ops/filter/special_characters_filter.py)           | [tests](../tests/ops/filter/test_special_characters_filter.py)           |
+| specified_field_filter              | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 根据字段过滤样本，要求字段的值处于指定目标中                    | [code](../data_juicer/ops/filter/specified_field_filter.py)              | [tests](../tests/ops/filter/test_specified_field_filter.py)              |
+| specified_numeric_field_filter      | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 根据字段过滤样本，要求字段的值处于指定范围（针对数字类型）             | [code](../data_juicer/ops/filter/specified_numeric_field_filter.py)      | [tests](../tests/ops/filter/test_specified_numeric_field_filter.py)      |
+| stopwords_filter                    | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留停用词比率高于指定阈值的样本                          | [code](../data_juicer/ops/filter/stopwords_filter.py)                    | [tests](../tests/ops/filter/test_stopwords_filter.py)                    |
+| suffix_filter                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                                                                               | 保留包含特定后缀的样本                               | [code](../data_juicer/ops/filter/suffix_filter.py)                       | [tests](../tests/ops/filter/test_suffix_filter.py)                       |
+| text_action_filter                  | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留文本部分包含动作的样本                             | [code](../data_juicer/ops/filter/text_action_filter.py)                  | [tests](../tests/ops/filter/test_text_action_filter.py)                  |
+| text_entity_dependency_filter       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留文本部分的依存树中具有非独立实体的样本                     | [code](../data_juicer/ops/filter/text_entity_dependency_filter.py)       | [tests](../tests/ops/filter/test_text_entity_dependency_filter.py)       |
+| text_length_filter                  | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                               | 保留总文本长度在指定范围内的样本                          | [code](../data_juicer/ops/filter/text_length_filter.py)                  | [tests](../tests/ops/filter/test_text_length_filter.py)                  |
+| token_num_filter                    | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic) | 保留token数在指定范围内的样本                         | [code](../data_juicer/ops/filter/token_num_filter.py)                    | [tests](../tests/ops/filter/test_token_num_filter.py)                    |
+| video_aesthetics_filter             | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                             | 保留指定帧的美学分数在指定范围内的样本                       | [code](../data_juicer/ops/filter/video_aesthetics_filter.py)             | [tests](../tests/ops/filter/test_video_aesthetics_filter.py)             |
+| video_aspect_ratio_filter           | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留包含视频的宽高比在指定范围内的样本                       | [code](../data_juicer/ops/filter/video_aspect_ratio_filter.py)           | [tests](../tests/ops/filter/test_video_aspect_ratio_filter.py)           |
+| video_duration_filter               | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留包含视频的时长在指定范围内的样本                        | [code](../data_juicer/ops/filter/video_duration_filter.py)               | [tests](../tests/ops/filter/test_video_duration_filter.py)               |
 | video_frames_text_similarity_filter | ![Multimodal](https://img.shields.io/badge/Multimodal-F25922?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)                                                                                                                                                                                   | 保留视频中指定帧的图像-文本的特征余弦相似度(基于CLIP模型)在指定范围内的样本 | [code](../data_juicer/ops/filter/video_frames_text_similarity_filter.py) | [tests](../tests/ops/filter/test_video_frames_text_similarity_filter.py) |
 | video_motion_score_filter           | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留包含视频的运动分数（基于稠密光流）在指定范围内的样本                    | [code](../data_juicer/ops/filter/video_motion_score_filter.py)           | [tests](../tests/ops/filter/test_video_motion_score_filter.py)           |
 | video_motion_score_raft_filter      | ![Video](https://img.shields.io/badge/Video-F2B138?style=plastic)                                                                                                                                                                                                                                                           | 保留包含视频的运动分数（基于 RAFT 模型估计的稠密光流）在指定范围内的样本      | [code](../data_juicer/ops/filter/video_motion_score_raft_raft_filter.py)      | [tests](../tests/ops/filter/test_video_motion_score_filter.py)      |
diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt
index 9793d5746..0ecd058c4 100644
--- a/environments/dev_requires.txt
+++ b/environments/dev_requires.txt
@@ -4,3 +4,4 @@ sphinx
 sphinx-autobuild
 sphinx_rtd_theme
 recommonmark
+wandb
diff --git a/environments/science_requires.txt b/environments/science_requires.txt
index f1e613126..10ea3b86e 100644
--- a/environments/science_requires.txt
+++ b/environments/science_requires.txt
@@ -11,7 +11,7 @@ selectolax
 nlpaug
 nlpcda
 nltk<3.9
-transformers>=4.37
+transformers>=4.47.0
 transformers_stream_generator
 einops
 accelerate
diff --git a/tests/benchmark_performance/configs/audio.yaml b/tests/benchmark_performance/configs/audio.yaml
new file mode 100644
index 000000000..848c537b0
--- /dev/null
+++ b/tests/benchmark_performance/configs/audio.yaml
@@ -0,0 +1,14 @@
+# The config file for performance benchmark to measure the processing speed for
+# the current Data-Juicer system. OPs are selected according to their tags and
+# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
+
+project_name: 'performance-benchmark-audio'
+dataset_path: 'perf_bench_data/audio/audio-10k.jsonl'  # under the dataset dir that run.sh unpacks
+export_path: 'outputs/performance_benchmark_audio/res.jsonl'  # report.py reads from this directory
+np: 16  # presumably the number of worker processes -- TODO confirm
+use_cache: false
+
+process:
+  - audio_duration_filter:  # no arguments given for this OP
+  - audio_nmf_snr_filter:  # no arguments given for this OP
+  - audio_size_filter:  # no arguments given for this OP
diff --git a/tests/benchmark_performance/configs/image.yaml b/tests/benchmark_performance/configs/image.yaml
new file mode 100644
index 000000000..3ce03be53
--- /dev/null
+++ b/tests/benchmark_performance/configs/image.yaml
@@ -0,0 +1,23 @@
+# The config file for performance benchmark to measure the processing speed for
+# the current Data-Juicer system. OPs are selected according to their tags and
+# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
+
+project_name: 'performance-benchmark-image'
+dataset_path: 'perf_bench_data/image/10k.jsonl'  # under the dataset dir that run.sh unpacks
+export_path: 'outputs/performance_benchmark_image/res.jsonl'  # report.py reads from this directory
+np: 16  # presumably the number of worker processes -- TODO confirm
+use_cache: false
+
+process:
+  - image_aesthetics_filter:
+      hf_scorer_model: 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
+      min_score: 0.0  # presumably low enough to keep every sample -- TODO confirm
+      mem_required: '1500MB'
+  - image_captioning_mapper:
+      hf_img2seq: 'Salesforce/blip2-opt-2.7b'
+      caption_num: 1
+      keep_original_sample: false
+      mem_required: '16GB'
+  - image_shape_filter:  # no arguments given for this OP
+  - image_blur_mapper:  # no arguments given for this OP
+  - image_deduplicator:  # no arguments given for this OP
diff --git a/tests/benchmark_performance/configs/text.yaml b/tests/benchmark_performance/configs/text.yaml
new file mode 100644
index 000000000..8b39bbeb8
--- /dev/null
+++ b/tests/benchmark_performance/configs/text.yaml
@@ -0,0 +1,21 @@
+# The config file for performance benchmark to measure the processing speed for
+# the current Data-Juicer system. OPs are selected according to their tags and
+# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
+
+project_name: 'performance-benchmark-text'
+dataset_path: 'perf_bench_data/text/wiki-10k.jsonl'  # under the dataset dir that run.sh unpacks
+export_path: 'outputs/performance_benchmark_text/res.jsonl'  # report.py reads from this directory
+np: 16  # presumably the number of worker processes -- TODO confirm
+use_cache: false
+
+process:
+  - whitespace_normalization_mapper:  # no arguments given for this OP
+  - token_num_filter:
+      hf_tokenizer: 'EleutherAI/pythia-6.9b-deduped'
+      min_num: 0
+  - document_deduplicator:
+      lowercase: false
+      ignore_non_character: false
+  - topk_specified_field_selector:
+      field_key: '__dj__stats__.num_token'  # presumably the stat written by token_num_filter above -- TODO confirm
+      topk: 1000
diff --git a/tests/benchmark_performance/configs/video.yaml b/tests/benchmark_performance/configs/video.yaml
new file mode 100644
index 000000000..a7df19639
--- /dev/null
+++ b/tests/benchmark_performance/configs/video.yaml
@@ -0,0 +1,20 @@
+# The config file for performance benchmark to measure the processing speed for
+# the current Data-Juicer system. OPs are selected according to their tags and
+# types (https://github.com/modelscope/data-juicer/blob/main/docs/Operators.md)
+
+project_name: 'performance-benchmark-video'
+dataset_path: 'perf_bench_data/video/msr_vtt_train.jsonl'  # under the dataset dir that run.sh unpacks
+export_path: 'outputs/performance_benchmark_video/res.jsonl'  # report.py reads from this directory
+np: 16  # presumably the number of worker processes -- TODO confirm
+use_cache: false
+
+process:
+  - video_nsfw_filter:
+      hf_nsfw_model: 'Falconsai/nsfw_image_detection'
+      score_threshold: 1.0  # presumably high enough to keep every sample -- TODO confirm
+      mem_required: '1GB'
+  - video_tagging_from_frames_mapper:  # no arguments given for this OP
+  - video_duration_filter:  # no arguments given for this OP
+  - video_split_by_key_frame_mapper:
+      keep_original_sample: false
+  - video_deduplicator:  # no arguments given for this OP
diff --git a/tests/benchmark_performance/report.py b/tests/benchmark_performance/report.py
new file mode 100644
index 000000000..e53afa63a
--- /dev/null
+++ b/tests/benchmark_performance/report.py
@@ -0,0 +1,126 @@
+import wandb
+import fire
+import os
+import json
+import yaml
+import regex as re
+from loguru import logger
+
+PROJECT = 'Data-Juicer Reports'  # wandb project the benchmark runs live in
+RUN_NAME = 'Performance Benchmark -- %s'  # %s is filled with the modality
+MODALITIES = {'text', 'image', 'video', 'audio'}  # iterated by main()
+DIFF_TH = 0.1  # presumably the relative slowdown threshold -- TODO confirm
+
+def get_run_id(project, run_name, entity='dail'):
+    """Return the id of the first run named `run_name`, or '' if none."""
+    # runs are listed from '<entity>/<project>' through the wandb public API
+    candidates = wandb.Api().runs(path=f'{entity}/{project}')
+    # lazily stop at the first name match, falling back to the empty string
+    matching_ids = (item.id for item in candidates if item.name == run_name)
+    return next(matching_ids, '')
+
+def init_run(modality, config=None):
+    """Create or resume the wandb run of `modality`; return (run, run_id).
+
+    `config` is only passed to wandb when a new run has to be created.
+    """
+    run_name = RUN_NAME % modality
+    run_id = get_run_id(PROJECT, run_name)
+    if run_id == '':
+        # no existing run: create one and take the id from the run object
+        # itself instead of issuing a second lookup through the API
+        run = wandb.init(project=PROJECT, config=config, name=run_name,
+                         tags=['performance benchmark', modality])
+        run_id = run.id
+    else:
+        # a run with this name exists: resume it by id (resume='must')
+        run = wandb.init(project=PROJECT, id=run_id, resume='must')
+    return run, run_id
+
+def _log_time_change(subject, last, now):
+    """Log the relative change from `last` to `now`; warn above DIFF_TH."""
+    dif = (now - last) / last
+    if dif > DIFF_TH:
+        # `DIFF_TH:.0%` renders the threshold as a percentage, e.g. '10%'
+        logger.warning(f'{subject} increased by {dif * 100}% '
+                       f'(> {DIFF_TH:.0%}). Before-{last} vs. Now-{now}')
+    else:
+        logger.info(f'{subject} increased by {dif * 100}%. '
+                    f'Before-{last} vs. Now-{now}')
+
+def main():
+    """Upload the benchmark results of every modality to wandb.
+
+    For each modality in MODALITIES, the per-OP and total times are parsed
+    from the export log under outputs/performance_benchmark_<modality>/,
+    merged with monitor/monitor.json, logged to the modality's wandb run,
+    and compared with the previous record of that run.
+    """
+    wandb.login()
+    for modality in MODALITIES:
+        logger.info(f'--------------- {modality} ---------------')
+        work_dir = f'outputs/performance_benchmark_{modality}/'
+
+        # read config
+        with open(os.path.join(work_dir, f'{modality}.yaml')) as fin:
+            config = yaml.load(fin, yaml.FullLoader)
+
+        # init the wandb run
+        run, run_id = init_run(modality, config)
+
+        # collect results from logs; with several export logs, use the most
+        # recently modified one instead of whichever os.listdir yields first
+        log_pt = r'export_(.*?)_time_(\d*?).txt'
+        log_dir = os.path.join(work_dir, 'log')
+        log_files = [os.path.join(log_dir, fn) for fn in os.listdir(log_dir)
+                     if re.match(log_pt, fn)]
+        if not log_files:
+            logger.warning('No log files found.')
+            exit()  # NOTE(review): ends the whole report with status 0
+        with open(max(log_files, key=os.path.getmtime)) as fin:
+            log_content = fin.read()
+        op_pt = r'OP \[(.*?)\] Done in (.*?)s'
+        total_pt = r'All OPs are done in (.*?)s'
+        op_data = re.findall(op_pt, log_content)
+        ops = [it[0] for it in op_data]
+        total_data = re.findall(total_pt, log_content)
+
+        # map each OP name (plus 'total_time') to {'time': <float>}
+        res = dict(op_data)
+        res['total_time'] = total_data[0]
+        res = {key: {'time': float(res[key])} for key in res}
+
+        # collect resource utilization from monitor logs
+        monitor_file = os.path.join(work_dir, 'monitor', 'monitor.json')
+        with open(monitor_file) as fin:
+            monitor_res = json.load(fin)
+        # monitor entries are paired with the OPs by position
+        assert len(monitor_res) == len(ops)
+        for op, resource_util_dict in zip(ops, monitor_res):
+            res[op].update(resource_util_dict['resource_analysis'])
+
+        # upload results and finish the run
+        run.log({modality: res})
+        run.finish()
+
+        # compare with the last run
+        api = wandb.Api()
+        api_run = api.run(f'{PROJECT}/{run_id}')
+        run_history = api_run.history()
+        if len(run_history) < 2:
+            # fewer than two records: there is nothing to compare against
+            continue
+        # presumably row -1 is this upload and row -2 the previous benchmark
+        last_record = run_history.iloc[-2]
+
+        for op_name, _ in op_data:
+            _log_time_change(f'Time cost for OP {[op_name]}',
+                             last_record[f'{modality}.{op_name}.time'],
+                             res[op_name]['time'])
+        _log_time_change('Total time cost',
+                         last_record[f'{modality}.total_time.time'],
+                         res['total_time']['time'])
+
+
+if __name__ == '__main__':
+    fire.Fire(main)  # fire exposes main() as the command-line entry point
diff --git a/tests/benchmark_performance/run.sh b/tests/benchmark_performance/run.sh
new file mode 100644
index 000000000..4104967b8
--- /dev/null
+++ b/tests/benchmark_performance/run.sh
@@ -0,0 +1,27 @@
+#!/bin/bash
+# Usage: run.sh <WANDB_BASE_URL> <WANDB_API_KEY>
+# setup wandb configs
+export WANDB_BASE_URL="$1"
+export WANDB_API_KEY="$2"
+
+BENCH_PATH=$(cd "$(dirname "$0")"; pwd)
+RELATIVE_DJ_PATH=../..
+MODALITIES=("text" "image" "video" "audio")
+
+# all paths below are relative to this directory; stop if it cannot be entered
+cd "$BENCH_PATH" || exit 1
+# 1. prepare dataset (stop early when the download or the extraction fails)
+wget http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxvf perf_bench_data.tar.gz || exit 1
+
+# 2. run the benchmark
+for modality in "${MODALITIES[@]}"
+do
+    python "$RELATIVE_DJ_PATH/tools/process_data.py" --config "configs/$modality.yaml"
+done
+
+# 3. collect & upload benchmark results
+python report.py
+
+# 4. clear resources
+rm -rf perf_bench_data.tar.gz
+rm -rf perf_bench_data/