From 7d4a37bca5b4116bf741a9317f9f2acf8935efa5 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 15:14:04 +0800
Subject: [PATCH 1/8] * refine perf bench workflow

* fix wrong var in sphinx docs
---
 .github/workflows/deploy_sphinx_docs.yml | 4 ++--
 .github/workflows/perf-bench.yml         | 4 ++--
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/deploy_sphinx_docs.yml b/.github/workflows/deploy_sphinx_docs.yml
index 9c8ae89a0..e5c71385d 100644
--- a/.github/workflows/deploy_sphinx_docs.yml
+++ b/.github/workflows/deploy_sphinx_docs.yml
@@ -15,10 +15,10 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Setup Python ${{ matrix.python-version }}
+    - name: Setup Python ${{ matrix.python_version }}
       uses: actions/setup-python@master
       with:
-        python_version: ${{ matrix.python-version }}
+        python_version: ${{ matrix.python_version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
diff --git a/.github/workflows/perf-bench.yml b/.github/workflows/perf-bench.yml
index 2a4d6658b..2a50d9360 100644
--- a/.github/workflows/perf-bench.yml
+++ b/.github/workflows/perf-bench.yml
@@ -16,8 +16,8 @@ env:
   ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 
 jobs:
-  unittest-single:
-    runs-on: [self-hosted, linux]
+  perf_bench:
+    runs-on: [GPU, unittest]
     environment: Testing
     steps:
     - uses: actions/checkout@v3

From 95e24e958a77e490c4661eac7a27022938714e2e Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 15:18:31 +0800
Subject: [PATCH 2/8] * refine perf bench workflow

---
 .github/workflows/perf-bench.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/perf-bench.yml b/.github/workflows/perf-bench.yml
index 2a50d9360..4094070db 100644
--- a/.github/workflows/perf-bench.yml
+++ b/.github/workflows/perf-bench.yml
@@ -42,7 +42,7 @@ jobs:
     - name: Run performance benchmark standalone
      working-directory: dj-${{ github.run_id }}/.github/workflows/docker
       run: |
-        docker compose exec ray-head python tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
+        docker compose exec ray-head bash tests/benchmark_performance/run.sh ${{ secrets.INTERNAL_WANDB_URL }} ${{ secrets.INTERNAL_WANDB_API_KEY }}
 
     - name: Remove docker compose
       working-directory: dj-${{ github.run_id }}/.github/workflows/docker

From 9d38dc502aba24c190043e2939b576bbb3b7dfa8 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 15:22:55 +0800
Subject: [PATCH 3/8] * fix wrong var in sphinx docs

---
 .github/workflows/deploy_sphinx_docs.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/deploy_sphinx_docs.yml b/.github/workflows/deploy_sphinx_docs.yml
index e5c71385d..6012293d5 100644
--- a/.github/workflows/deploy_sphinx_docs.yml
+++ b/.github/workflows/deploy_sphinx_docs.yml
@@ -15,10 +15,10 @@ jobs:
     steps:
     - name: Checkout
       uses: actions/checkout@v4
-    - name: Setup Python ${{ matrix.python_version }}
+    - name: Setup Python ${{ matrix.python-version }}
       uses: actions/setup-python@master
       with:
-        python_version: ${{ matrix.python_version }}
+        python-version: ${{ matrix.python-version }}
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip

From 5715d93934f7b1eb80e5e4d890dadfd002114fb0 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 16:13:28 +0800
Subject: [PATCH 4/8] * set python version matrix to include only 3.9 and 3.10

---
 .github/workflows/deploy_sphinx_docs.yml | 3 +++
 1 file changed, 3 insertions(+)
diff --git a/.github/workflows/deploy_sphinx_docs.yml b/.github/workflows/deploy_sphinx_docs.yml
index 6012293d5..5cf0205ae 100644
--- a/.github/workflows/deploy_sphinx_docs.yml
+++ b/.github/workflows/deploy_sphinx_docs.yml
@@ -12,6 +12,9 @@ on:
 jobs:
   pages:
     runs-on: ubuntu-20.04
+    strategy:
+      matrix:
+        python-version: [ "3.9", "3.10" ]
     steps:
     - name: Checkout
       uses: actions/checkout@v4

From 8f1b03b78245816a3a72e6252e3c42b83a7dcb71 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 16:38:12 +0800
Subject: [PATCH 5/8] * hide unnecessary logs

---
 tests/benchmark_performance/run.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/benchmark_performance/run.sh b/tests/benchmark_performance/run.sh
index 4104967b8..1ec839d57 100644
--- a/tests/benchmark_performance/run.sh
+++ b/tests/benchmark_performance/run.sh
@@ -11,7 +11,7 @@ MODALITIES=("text" "image" "video" "audio")
 cd $BENCH_PATH
 
 # 1. prepare dataset
-wget http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxvf perf_bench_data.tar.gz
+wget -q http://dail-wlcb.oss-cn-wulanchabu.aliyuncs.com/data_juicer/perf_bench_data/perf_bench_data.tar.gz && tar zxf perf_bench_data.tar.gz
 
 # 2. run the benchmark
 for modality in ${MODALITIES[@]}

From 12d69053417ba7299c7d94da97227247ee4d6c34 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Fri, 6 Dec 2024 17:37:47 +0800
Subject: [PATCH 6/8] * update mem_required for image tagging models

---
 configs/config_all.yaml                        | 3 +++
 tests/benchmark_performance/configs/video.yaml | 1 +
 2 files changed, 4 insertions(+)

diff --git a/configs/config_all.yaml b/configs/config_all.yaml
index 4019b66a5..756cadd81 100644
--- a/configs/config_all.yaml
+++ b/configs/config_all.yaml
@@ -212,6 +212,7 @@ process:
       radius: 2  # radius of blur kernel
   - image_tagging_mapper:  # Mapper to generate image tags.
       tag_field_name: '__dj__image_tags__'  # the field name to store the tags. It's "__dj__image_tags__" in default.
+      mem_required: '9GB'
   - nlpaug_en_mapper:  # simply augment texts in English based on the nlpaug library
       sequential: false  # whether combine all augmentation methods to a sequence. If it's True, a sample will be augmented by all opened augmentation methods sequentially. If it's False, each opened augmentation method would generate its augmented samples independently.
       aug_num: 1  # number of augmented samples to be generated. If `sequential` is True, there will be total aug_num augmented samples generated. If it's False, there will be (aug_num * #opened_aug_method) augmented samples generated.
@@ -382,6 +383,7 @@ process:
       frame_sampling_method: 'all_keyframes'  # sampling method of extracting frame images from the videos. Should be one of ["all_keyframes", "uniform"]. The former one extracts all key frames and the latter one extract specified number of frames uniformly from the video. Default: "all_keyframes".
       frame_num: 3  # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       tag_field_name: '__dj__video_frame_tags__'  # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
+      mem_required: '9GB'
   - whitespace_normalization_mapper:  # normalize different kinds of whitespaces to English whitespace.
 
   # Filter ops
@@ -614,6 +616,7 @@ process:
       frame_num: 3  # the number of frames to be extracted uniformly from the video. Only works when frame_sampling_method is "uniform". If it's 1, only the middle frame will be extracted. If it's 2, only the first and the last frames will be extracted. If it's larger than 2, in addition to the first and the last frames, other frames will be extracted uniformly within the video duration.
       tag_field_name: '__dj__video_frame_tags__'  # the field name to store the tags. It's "__dj__video_frame_tags__" in default.
       any_or_all: any  # keep this sample when any/all videos meet the filter condition
+      mem_required: '9GB'
   - words_num_filter:  # filter text with number of words out of specific range
       lang: en  # sample in which language
       tokenization: false  # whether to use model to tokenize documents
diff --git a/tests/benchmark_performance/configs/video.yaml b/tests/benchmark_performance/configs/video.yaml
index a7df19639..28fb3b98a 100644
--- a/tests/benchmark_performance/configs/video.yaml
+++ b/tests/benchmark_performance/configs/video.yaml
@@ -14,6 +14,7 @@ process:
       score_threshold: 1.0
       mem_required: '1GB'
   - video_tagging_from_frames_mapper:
+      mem_required: '9GB'
   - video_duration_filter:
   - video_split_by_key_frame_mapper:
       keep_original_sample: false

From b1b64389a4d477875c9c48a7217fb2f267034df7 Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Mon, 9 Dec 2024 11:18:31 +0800
Subject: [PATCH 7/8] * enable unittests for 3 OPs due to dependency

---
 tests/ops/filter/test_audio_duration_filter.py           | 5 +----
 tests/ops/filter/test_audio_nmf_snr_filter.py            | 5 +----
 tests/ops/mapper/test_video_tagging_from_audio_mapper.py | 5 +----
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/tests/ops/filter/test_audio_duration_filter.py b/tests/ops/filter/test_audio_duration_filter.py
index 5b367f0ec..64a5c05c8 100644
--- a/tests/ops/filter/test_audio_duration_filter.py
+++ b/tests/ops/filter/test_audio_duration_filter.py
@@ -5,11 +5,8 @@
 from data_juicer.ops.filter.audio_duration_filter import AudioDurationFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, TEST_TAG
 
 
-# skip due to conflicts when run lazy_load in multiprocessing in librosa
-# tests passed locally.
-@SKIPPED_TESTS.register_module()
 class AudioDurationFilterTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/filter/test_audio_nmf_snr_filter.py b/tests/ops/filter/test_audio_nmf_snr_filter.py
index 384435828..d0dec38b8 100644
--- a/tests/ops/filter/test_audio_nmf_snr_filter.py
+++ b/tests/ops/filter/test_audio_nmf_snr_filter.py
@@ -5,11 +5,8 @@
 from data_juicer.ops.filter.audio_nmf_snr_filter import AudioNMFSNRFilter
 from data_juicer.utils.constant import Fields
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
 
-# skip due to conflicts when run lazy_load in multiprocessing in librosa
-# tests passed locally.
-@SKIPPED_TESTS.register_module()
 class AudioNMFSNRFilterTest(DataJuicerTestCaseBase):
 
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
diff --git a/tests/ops/mapper/test_video_tagging_from_audio_mapper.py b/tests/ops/mapper/test_video_tagging_from_audio_mapper.py
index 5cace0b7a..8bbf05933 100644
--- a/tests/ops/mapper/test_video_tagging_from_audio_mapper.py
+++ b/tests/ops/mapper/test_video_tagging_from_audio_mapper.py
@@ -6,11 +6,8 @@
     VideoTaggingFromAudioMapper
 from data_juicer.utils.constant import Fields
 from data_juicer.utils.mm_utils import SpecialTokens
-from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
 
-# skip due to conflicts when run lazy_load in multiprocessing in librosa
-# tests passed locally.
-@SKIPPED_TESTS.register_module()
 class VideoTaggingFromAudioMapperTest(DataJuicerTestCaseBase):
     data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
                              'data')

From 5999cc32581c2d5d3f60b02be448ebda746183dc Mon Sep 17 00:00:00 2001
From: "lielin.hyl"
Date: Mon, 9 Dec 2024 11:53:08 +0800
Subject: [PATCH 8/8] + add two dependencies by librosa

---
 environments/minimal_requires.txt | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/environments/minimal_requires.txt b/environments/minimal_requires.txt
index df76b1358..414458edc 100644
--- a/environments/minimal_requires.txt
+++ b/environments/minimal_requires.txt
@@ -4,7 +4,11 @@ pandas
 numpy
 av==13.1.0
 soundfile
+# need to install two dependencies by librosa to avoid lazy_loader error
 librosa>=0.10
+samplerate
+resampy
+# need to install two dependencies by librosa to avoid lazy_loader error
 loguru
 tabulate
 tqdm