Commit: Minor modifications (#306)

* download imagededup models when they are used instead of when they are imported
* set the minimum version limitation for torch
* update the datasets library to 2.18.0 to support modelscope, which is required by the sandbox
* split sandbox requirements into a single file/tag
* add a requirements description to the sandbox docs
* fix an import bug in sandbox_starter.py
HYLcool authored Apr 25, 2024
1 parent eaf7746 commit f142c2e
Showing 11 changed files with 86 additions and 38 deletions.
README.md (15 changes: 8 additions & 7 deletions)

@@ -175,14 +175,15 @@ pip install -v -e .[tools] # install a subset of tools dependencies
 
 The dependency options are listed below:
 
-| Tag          | Description |
-|--------------|----------------------------------------------------------------------------------------------|
+| Tag              | Description |
+|------------------|----------------------------------------------------------------------------------------------|
 | `.` or `.[mini]` | Install minimal dependencies for basic Data-Juicer. |
-| `.[all]`     | Install all optional dependencies (including minimal dependencies and all of the following). |
-| `.[sci]`     | Install all dependencies for all OPs. |
-| `.[dist]`    | Install dependencies for distributed data processing. (Experimental) |
-| `.[dev]`     | Install dependencies for developing the package as contributors. |
-| `.[tools]`   | Install dependencies for dedicated tools, such as quality classifiers. |
+| `.[all]`         | Install all optional dependencies (including minimal dependencies and all of the following). |
+| `.[sci]`         | Install all dependencies for all OPs. |
+| `.[sandbox]`     | Install all dependencies for sandbox. |
+| `.[dist]`        | Install dependencies for distributed data processing. (Experimental) |
+| `.[dev]`         | Install dependencies for developing the package as contributors. |
+| `.[tools]`       | Install dependencies for dedicated tools, such as quality classifiers. |
 
 ### Using pip
 
README_ZH.md (15 changes: 8 additions & 7 deletions)

@@ -158,14 +158,15 @@ pip install -v -e .[tools] # 安装部分工具库的依赖
 
 依赖选项如下表所示:
 
-| 标签 | 描述 |
-|--------------|------------------------------|
+| 标签 | 描述 |
+|------------------|------------------------------|
 | `.` 或者 `.[mini]` | 安装支持 Data-Juicer 基础功能的最小依赖项 |
-| `.[all]`     | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) |
-| `.[sci]`     | 安装所有算子的全量依赖 |
-| `.[dist]`    | 安装以分布式方式进行数据处理的依赖(实验性功能) |
-| `.[dev]`     | 安装作为贡献者开发 Data-Juicer 所需的依赖项 |
-| `.[tools]`   | 安装专用工具库(如质量分类器)所需的依赖项 |
+| `.[all]`         | 安装所有可选依赖项(包括最小依赖项以及下面所有依赖项) |
+| `.[sci]`         | 安装所有算子的全量依赖 |
+| `.[sandbox]`     | 安装沙盒实验室的基础依赖 |
+| `.[dist]`        | 安装以分布式方式进行数据处理的依赖(实验性功能) |
+| `.[dev]`         | 安装作为贡献者开发 Data-Juicer 所需的依赖项 |
+| `.[tools]`       | 安装专用工具库(如质量分类器)所需的依赖项 |
 
 ### 使用 pip 安装
 
data_juicer/ops/deduplicator/image_deduplicator.py (27 changes: 17 additions & 10 deletions)

@@ -13,14 +13,21 @@
 OP_NAME = 'image_deduplicator'
 
 with AvailabilityChecking(['imagededup'], OP_NAME):
-    from imagededup.methods import AHash, DHash, PHash, WHash
+    import imagededup  # noqa: F401
 
-    HASH_METHOD = {
-        'phash': PHash,
-        'dhash': DHash,
-        'whash': WHash,
-        'ahash': AHash
-    }
+    HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
+
+    def get_hash_method(method_name):
+        from imagededup.methods import AHash, DHash, PHash, WHash
+
+        mapping = {
+            'phash': PHash,
+            'dhash': DHash,
+            'whash': WHash,
+            'ahash': AHash
+        }
+
+        return mapping[method_name]
 
 
 @OPERATORS.register_module(OP_NAME)

@@ -40,10 +47,10 @@ def __init__(self, method: str = 'phash', *args, **kwargs):
         :param kwargs: extra args
         """
         super().__init__(*args, **kwargs)
-        if method not in HASH_METHOD.keys():
+        if method not in HASH_METHOD:
             raise ValueError(f'Keep strategy [{method}] is not supported. '
-                             f'Can only be one of {HASH_METHOD.keys()}.')
-        self.hasher = HASH_METHOD[method]()
+                             f'Can only be one of {HASH_METHOD}.')
+        self.hasher = get_hash_method(method)()
 
     def compute_hash(self, sample, context=False):
         # check if it's computed already
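The same deferred-import pattern is applied to the Ray variant in the next file. As a rough sketch of what the change means at call time (the operator class name `ImageDeduplicator` is an assumption here, since the class definition line is elided from the hunks; the module path comes from the file header):

```python
# Sketch only: importing the op module no longer pulls in imagededup.methods,
# so any model download tied to that import is deferred until an operator
# (or hasher) is actually constructed.
from data_juicer.ops.deduplicator.image_deduplicator import (
    ImageDeduplicator,  # class name assumed; the definition line is elided above
    get_hash_method,
)

hasher = get_hash_method('phash')()      # `from imagededup.methods import ...` runs here
op = ImageDeduplicator(method='phash')   # __init__ performs the same lazy lookup
```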
data_juicer/ops/deduplicator/ray_image_deduplicator.py (27 changes: 17 additions & 10 deletions)

@@ -11,14 +11,21 @@
 OP_NAME = 'ray_image_deduplicator'
 
 with AvailabilityChecking(['imagededup'], OP_NAME):
-    from imagededup.methods import AHash, DHash, PHash, WHash
+    import imagededup  # noqa: F401
 
-    HASH_METHOD = {
-        'phash': PHash,
-        'dhash': DHash,
-        'whash': WHash,
-        'ahash': AHash
-    }
+    HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
+
+    def get_hash_method(method_name):
+        from imagededup.methods import AHash, DHash, PHash, WHash
+
+        mapping = {
+            'phash': PHash,
+            'dhash': DHash,
+            'whash': WHash,
+            'ahash': AHash
+        }
+
+        return mapping[method_name]
 
 
 @OPERATORS.register_module(OP_NAME)

@@ -46,10 +53,10 @@ def __init__(self,
                          redis_port=redis_port,
                          *args,
                          **kwargs)
-        if method not in HASH_METHOD.keys():
+        if method not in HASH_METHOD:
             raise ValueError(f'Keep strategy [{method}] is not supported. '
-                             f'Can only be one of {HASH_METHOD.keys()}.')
-        self.hasher = HASH_METHOD[method]()
+                             f'Can only be one of {HASH_METHOD}.')
+        self.hasher = get_hash_method(method)()
 
     def calculate_hash(self, sample, context=False):
         if self.image_key not in sample or not sample[self.image_key]:
docs/Sandbox-ZH.md (13 changes: 13 additions & 0 deletions)

@@ -4,6 +4,19 @@
 
 用户在沙盒中,除了Data-Juicer基础的数据优化与数据菜谱微调功能外,还可以便捷地使用数据洞察与分析、沙盒模型训练与评测、基于数据和模型反馈优化数据菜谱等可配置组件,共同组成完整的一站式数据-模型研发流水线。
 ## 快速上手
+### 依赖准备
+在使用沙盒实验室前,你可能需要使用如下命令安装沙盒相关的第三方依赖:
+```shell
+pip install -v -e .[sandbox]
+
+# 或者直接安装全量依赖
+pip install -v -e .[all]
+```
+
+**注意**:一些沙盒的依赖还需要额外的领域依赖。例如,如果用户想要在沙盒中训练一个 ModelScope 平台的NLP模型,那可能需要为 `modelscope`
+安装额外的 `nlp` 领域依赖(参考其[安装文档](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85) )。
+因此如果使用沙盒过程中,这些第三方依赖抛出了一些"未找到模块(Module-Not-Found)"的报错时,用户需要先检查这些库的文档以寻求帮助。
+
 ### 准备沙盒配置文件
 沙盒的主配置文件除了Data-Juicer的配置文件外,还包括了若干额外的参数用于指定沙盒流水线中可能会运行的模型训练、推理、评测等步骤的配置信息,完整的额外参数可参考 [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml) 中的"for sandbox or hpo"部分参数。一个sandbox的配置文件示例可参考`configs/demo/sandbox/sandbox.yaml`
 ```yaml
docs/Sandbox.md (13 changes: 13 additions & 0 deletions)

@@ -4,6 +4,19 @@ In Data-Juicer, the data sandbox laboratory provides users with the best practic
 
 In addition to the basic data optimization and recipe refinement features offered by Data-Juicer, users can seamlessly use configurable components such as data probe and analysis, model training and evaluation, and data and model feedback-based recipe refinement to form a complete one-stop data-model research and development pipeline.
 ## Quick Start
+### Requirements
+Before using the sandbox, you might need to install sandbox-related third-party dependencies by running the command below:
+```shell
+pip install -v -e .[sandbox]
+
+# or install all dependencies
+pip install -v -e .[all]
+```
+
+**NOTICE**: some sandbox-related dependencies require extra domain dependencies. For example, if users want to train an NLP model from ModelScope
+in the sandbox, they might need to install extra `nlp` dependencies for the `modelscope` library (see the [installation docs](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)).
+So if these third-party libraries raise Module-Not-Found errors when running the sandbox, users should check those libraries' docs first.
+
 ### Prepare Configuration Files for Sandbox
 The configuration file of the sandbox includes several additional parameters in addition to the configuration of Data-Juicer. These parameters are used to specify the configuration information for model training, inference, evaluation, and other steps that may run in the sandbox pipeline. For the complete set of additional parameters, please refer to the "for sandbox or hpo" section in the [config_all.yaml](https://github.com/modelscope/data-juicer/blob/main/configs/config_all.yaml). An example of a sandbox configuration file can be found in `configs/demo/sandbox/sandbox.yaml`:
 ```yaml
environments/minimal_requires.txt (2 changes: 1 addition & 1 deletion)

@@ -1,7 +1,7 @@
 fsspec==2023.5.0
 pyarrow<=12.0.0
 pandas==2.0.3
-datasets==2.11.0
+datasets==2.18.0
 av
 soundfile
 librosa
environments/sandbox_requires.txt (3 changes: 3 additions & 0 deletions)

@@ -0,0 +1,3 @@
+wandb
+# modelscope-related
+modelscope
environments/science_requires.txt (4 changes: 2 additions & 2 deletions)

@@ -1,3 +1,5 @@
+torch>=1.11.0
+torchaudio
 easyocr
 fasttext-wheel
 kenlm

@@ -16,8 +18,6 @@ accelerate
 tiktoken
 opencc==1.1.6
 imagededup
-torch
-torchaudio
 dlib
 spacy-pkuseg==0.0.32
 diffusers
setup.py (2 changes: 2 additions & 0 deletions)

@@ -41,6 +41,8 @@ def get_install_requirements(require_f_paths, env_dir='environments'):
     'tools':
     get_install_requirements(
         ['preprocess_requires.txt', 'quality_classifier_requires.txt']),
+    'sandbox':
+    get_install_requirements(['sandbox_requires.txt']),
 }
 extra_requires['all'] = [v for v in extra_requires.values()]
 
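For context, the added entry means `pip install -v -e .[sandbox]` resolves to the packages listed in `environments/sandbox_requires.txt`. A minimal sketch of the resulting wiring (the `setup()` call and package name below are illustrative, not copied from the repository):

```python
# Illustrative only: how an extras_require tag maps onto a pip install extra.
from setuptools import setup

setup(
    name='py-data-juicer',  # package name assumed
    extras_require={
        # parsed from environments/sandbox_requires.txt by get_install_requirements()
        'sandbox': ['wandb', 'modelscope'],
        # other tags ('mini', 'sci', 'dist', 'dev', 'tools', 'all') omitted here
    },
)
```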
tools/sandbox_starter.py (3 changes: 2 additions & 1 deletion)

@@ -1,9 +1,10 @@
 import json
 
 import yaml
+from jsonargparse import dict_to_namespace
 from loguru import logger
 
-from data_juicer.config import dict_to_namespace, init_configs
+from data_juicer.config import init_configs
 from data_juicer.core.sandbox.pipelines import SandBoxExecutor
 
 
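A hedged sketch of the corrected import in use, based only on the imports shown in this hunk and the config path mentioned in the sandbox docs; how `sandbox_starter.py` actually combines these calls is not shown in this diff, and the `SandBoxExecutor` API below is an assumption:

```python
# Sketch only: dict_to_namespace now comes from jsonargparse, not data_juicer.config.
import yaml
from jsonargparse import dict_to_namespace

from data_juicer.core.sandbox.pipelines import SandBoxExecutor

with open('configs/demo/sandbox/sandbox.yaml') as f:  # example path from the docs
    cfg = dict_to_namespace(yaml.safe_load(f))

executor = SandBoxExecutor(cfg)  # constructor signature assumed
executor.run()                   # method name assumed
```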
