add python_lambda_mapper (#492)

* init python_lambda_mapper * set default arg * fix init * support batched & add docs * fix docs * Quick fix for some minor problems (#503) * * remove str conversion for fps para of add_stream func + add requires from librosa to avoid lazy_loader failure during multiprocessing * * remove str conversion for fps para of add_stream func + add requires from librosa to avoid lazy_loader failure during multiprocessing * * install cmake before * * install cmake before * * install cmake before * * update unit test tags * * update unit test tags * * update unit test tags * * update unit test tags * * try to remove samplerate dep * * skip audio duration and audio nmf snr filters * * skip video_tagging_from_frames_filter * * skip video_tagging_from_audios_filter * * skip video_motion_score_raft_filter * fix batch bug (#504) * fix batch bug * fix filter batch * not rank for filter * limit pyav version --------- Co-authored-by: Yilun Huang <[email protected]> Co-authored-by: BeachWang <[email protected]>
modelscope · Dec 5, 2024 · 0fe505e · 0fe505e
1 parent 5a4b1a1
commit 0fe505e
Show file tree

Hide file tree

Showing 6 changed files with 163 additions and 15 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -257,6 +257,9 @@ process:
       model_params: {}                                        # Parameters for initializing the API model.
       sampling_params: {}                                     # Extra parameters passed to the API call.
   - punctuation_normalization_mapper:                       # normalize unicode punctuations to English punctuations.
+  - python_lambda_mapper:                                   # execute a Python lambda function on data samples.
+      lambda_str: ''                                          # A string representation of the lambda function to be executed on data samples. If empty, the identity function is used.
+      batched: False                                          # A boolean indicating whether to process input data in batches.
   - remove_bibliography_mapper:                             # remove bibliography from Latex text.
   - remove_comments_mapper:                                 # remove comments from Latex text, code, etc.
       doc_type: tex                                           # comment type you want to remove. Only support 'tex' for now.

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -30,6 +30,7 @@
 from .optimize_response_mapper import OptimizeResponseMapper
 from .pair_preference_mapper import PairPreferenceMapper
 from .punctuation_normalization_mapper import PunctuationNormalizationMapper
+from .python_lambda_mapper import PythonLambdaMapper
 from .remove_bibliography_mapper import RemoveBibliographyMapper
 from .remove_comments_mapper import RemoveCommentsMapper
 from .remove_header_mapper import RemoveHeaderMapper
@@ -75,17 +76,17 @@
     'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
     'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
     'PairPreferenceMapper', 'PunctuationNormalizationMapper',
-    'RemoveBibliographyMapper', 'RemoveCommentsMapper', 'RemoveHeaderMapper',
-    'RemoveLongWordsMapper', 'RemoveNonChineseCharacterlMapper',
-    'RemoveRepeatSentencesMapper', 'RemoveSpecificCharsMapper',
-    'RemoveTableTextMapper', 'RemoveWordsWithIncorrectSubstringsMapper',
-    'ReplaceContentMapper', 'SentenceSplitMapper', 'TextChunkMapper',
-    'VideoCaptioningFromAudioMapper', 'VideoCaptioningFromFramesMapper',
-    'VideoCaptioningFromSummarizerMapper', 'VideoCaptioningFromVideoMapper',
-    'VideoFFmpegWrappedMapper', 'VideoFaceBlurMapper',
-    'VideoRemoveWatermarkMapper', 'VideoResizeAspectRatioMapper',
-    'VideoResizeResolutionMapper', 'VideoSplitByDurationMapper',
-    'VideoSplitByKeyFrameMapper', 'VideoSplitBySceneMapper',
-    'VideoTaggingFromAudioMapper', 'VideoTaggingFromFramesMapper',
-    'WhitespaceNormalizationMapper'
+    'PythonLambdaMapper', 'RemoveBibliographyMapper', 'RemoveCommentsMapper',
+    'RemoveHeaderMapper', 'RemoveLongWordsMapper',
+    'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
+    'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
+    'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
+    'SentenceSplitMapper', 'TextChunkMapper', 'VideoCaptioningFromAudioMapper',
+    'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
+    'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
+    'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
+    'VideoResizeAspectRatioMapper', 'VideoResizeResolutionMapper',
+    'VideoSplitByDurationMapper', 'VideoSplitByKeyFrameMapper',
+    'VideoSplitBySceneMapper', 'VideoTaggingFromAudioMapper',
+    'VideoTaggingFromFramesMapper', 'WhitespaceNormalizationMapper'
 ]
diff --git a/data_juicer/ops/mapper/python_lambda_mapper.py b/data_juicer/ops/mapper/python_lambda_mapper.py
@@ -0,0 +1,74 @@
+import ast
+
+from ..base_op import OPERATORS, Mapper
+
+OP_NAME = 'python_lambda_mapper'
+
+
+@OPERATORS.register_module(OP_NAME)
+class PythonLambdaMapper(Mapper):
+    """Mapper for executing a Python lambda function on data samples."""
+
+    def __init__(self, lambda_str: str = '', batched: bool = False, **kwargs):
+        """
+        Initialization method.
+
+        :param lambda_str: A string representation of the lambda function to be
+            executed on data samples. If empty, the identity function is used.
+        :param batched: A boolean indicating whether to process input data in
+            batches.
+        :param kwargs: Additional keyword arguments passed to the parent class.
+        """
+        self._batched_op = bool(batched)
+        super().__init__(**kwargs)
+
+        # Parse and validate the lambda function
+        if not lambda_str:
+            self.lambda_func = lambda sample: sample
+        else:
+            self.lambda_func = self._create_lambda(lambda_str)
+
+    def _create_lambda(self, lambda_str: str):
+        # Parse input string into an AST and check for a valid lambda function
+        try:
+            node = ast.parse(lambda_str, mode='eval')
+
+            # Check if the body of the expression is a lambda
+            if not isinstance(node.body, ast.Lambda):
+                raise ValueError(
+                    'Input string must be a valid lambda function.')
+
+            # Check that the lambda has exactly one argument
+            if len(node.body.args.args) != 1:
+                raise ValueError(
+                    'Lambda function must have exactly one argument.')
+
+            # Compile the AST to code
+            compiled_code = compile(node, '<string>', 'eval')
+            # Evaluate the compiled code with built-ins available (not sandboxed)
+            func = eval(compiled_code, {'__builtins__': __builtins__})
+            return func
+        except Exception as e:
+            raise ValueError(f'Invalid lambda function: {e}')
+
+    def process_single(self, sample):
+        # Process the input through the lambda function and return the result
+        result = self.lambda_func(sample)
+
+        # Check that the result is a dictionary
+        if not isinstance(result, dict):
+            raise ValueError(f'Lambda function must return a dictionary, '
+                             f'got {type(result).__name__} instead.')
+
+        return result
+
+    def process_batched(self, samples):
+        # Process the input through the lambda function and return the result
+        result = self.lambda_func(samples)
+
+        # Check that the result is a dictionary
+        if not isinstance(result, dict):
+            raise ValueError(f'Lambda function must return a dictionary, '
+                             f'got {type(result).__name__} instead.')
+
+        return result
diff --git a/docs/Operators.md b/docs/Operators.md
@@ -11,7 +11,7 @@ The operators in Data-Juicer are categorized into 5 types.
 | Type                              | Number | Description                                     |
 |-----------------------------------|:------:|-------------------------------------------------|
 | [ Formatter ]( #formatter )       |   9    | Discovers, loads, and canonicalizes source data |
-| [ Mapper ]( #mapper )             |   59   | Edits and transforms samples                    |
+| [ Mapper ]( #mapper )             |   60   | Edits and transforms samples                    |
 | [ Filter ]( #filter )             |   44   | Filters out low-quality samples                 |
 | [ Deduplicator ]( #deduplicator ) |   8    | Detects and removes duplicate samples           |
 | [ Selector ]( #selector )         |   4    | Selects top samples based on ranking            |
@@ -88,6 +88,7 @@ All the specific operators are listed below, each featured with several capabili
 | optimize_response_mapper                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)   | Optimize the response in question-answering samples.                                                                                                                           | [code](../data_juicer/ops/mapper/optimize_response_mapper.py)                      | [tests](../tests/ops/mapper/test_optimize_response_mapper.py)                      |
 | pair_preference_mapper                         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Construct paired preference samples.                                                                                                                                           | [code](../data_juicer/ops/mapper/pair_preference_mapper.py)                        | [tests](../tests/ops/mapper/test_pair_preference_mapper.py)                        |
 | punctuation_normalization_mapper               | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Normalizes various Unicode punctuations to their ASCII equivalents                                                                                                             | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py)              | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py)              |
+| python_lambda_mapper                           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | Executes a Python lambda function on data samples                                                                                                                              | [code](../data_juicer/ops/mapper/python_lambda_mapper.py)                          | [tests](../tests/ops/mapper/test_python_lambda_mapper.py)                          |
 | remove_bibliography_mapper                     | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the bibliography of TeX documents                                                                                                                                      | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py)                    | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py)                    |
 | remove_comments_mapper                         | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the comments of TeX documents                                                                                                                                          | [code](../data_juicer/ops/mapper/remove_comments_mapper.py)                        | [tests](../tests/ops/mapper/test_remove_comments_mapper.py)                        |
 | remove_header_mapper                           | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | Removes the running headers of TeX documents, e.g., titles, chapter or section numbers/names                                                                                   | [code](../data_juicer/ops/mapper/remove_header_mapper.py)                          | [tests](../tests/ops/mapper/test_remove_header_mapper.py)                          |

diff --git a/docs/Operators_ZH.md b/docs/Operators_ZH.md
@@ -11,7 +11,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | 类型                                | 数量 | 描述            |
 |------------------------------------|:--:|---------------|
 | [ Formatter ]( #formatter )        |  9 | 发现、加载、规范化原始数据 |
-| [ Mapper ]( #mapper )              | 59 | 对数据样本进行编辑和转换  |
+| [ Mapper ]( #mapper )              | 60 | 对数据样本进行编辑和转换  |
 | [ Filter ]( #filter )              | 44 | 过滤低质量样本       |
 | [ Deduplicator ]( #deduplicator )  |  8 | 识别、删除重复样本     |
 | [ Selector ]( #selector )          |  4 | 基于排序选取高质量样本   |
@@ -87,6 +87,7 @@ Data-Juicer 中的算子分为以下 5 种类型。
 | optimize_response_mapper                       | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic) ![GPU](https://img.shields.io/badge/GPU-F27649?style=plastic)   | 指令优化，优化 response                                                                                              | [code](../data_juicer/ops/mapper/optimize_response_mapper.py)                      | [tests](../tests/ops/mapper/test_optimize_response_mapper.py)                      |
 | pair_preference_mapper                         | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 构造配对的偏好样本                                                                                                   | [code](../data_juicer/ops/mapper/pair_preference_mapper.py)                        | [tests](../tests/ops/mapper/test_pair_preference_mapper.py)                        |
 | punctuation_normalization_mapper               | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 将各种 Unicode 标点符号标准化为其 ASCII 等效项                                                                       | [code](../data_juicer/ops/mapper/punctuation_normalization_mapper.py)              | [tests](../tests/ops/mapper/test_punctuation_normalization_mapper.py)              |
+| python_lambda_mapper                           | ![General](https://img.shields.io/badge/General-5FBF50?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                 | 执行 Python lambda 函数处理样本                                                                                  | [code](../data_juicer/ops/mapper/python_lambda_mapper.py)                          | [tests](../tests/ops/mapper/test_python_lambda_mapper.py)                          |
 | remove_bibliography_mapper                     | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档的参考文献                                                                                              | [code](../data_juicer/ops/mapper/remove_bibliography_mapper.py)                    | [tests](../tests/ops/mapper/test_remove_bibliography_mapper.py)                    |
 | remove_comments_mapper                         | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档中的注释                                                                                                | [code](../data_juicer/ops/mapper/remove_comments_mapper.py)                        | [tests](../tests/ops/mapper/test_remove_comments_mapper.py)                        |
 | remove_header_mapper                           | ![LaTeX](https://img.shields.io/badge/LaTeX-D99379?style=plastic) ![Text](https://img.shields.io/badge/Text-010326?style=plastic) ![en](https://img.shields.io/badge/en-A60D1A?style=plastic) ![zh](https://img.shields.io/badge/zh-F2D6A2?style=plastic)                                                                     | 删除 TeX 文档头，例如标题、章节数字/名称等                                                                           | [code](../data_juicer/ops/mapper/remove_header_mapper.py)                          | [tests](../tests/ops/mapper/test_remove_header_mapper.py)                          |