diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index a3f5c17e4..901e8523e 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,4 +1,5 @@ import copy +import os import traceback from functools import wraps @@ -6,6 +7,7 @@ from loguru import logger from data_juicer import is_cuda_available +from data_juicer.utils.auto_install_utils import AutoInstaller from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import size_to_bytes from data_juicer.utils.process_utils import calculate_np @@ -13,6 +15,10 @@ OPERATORS = Registry('Operators') UNFORKABLE = Registry('Unforkable') +current_path = os.path.dirname(os.path.realpath(__file__)) +version_file_path = os.path.join(current_path, + '../../environments/science_requires.txt') +AUTOINSTALL = AutoInstaller([version_file_path]) def convert_list_dict_to_dict_list(samples): diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 6fa47c869..54b2edc4f 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -7,6 +7,7 @@ from collections import defaultdict from typing import Optional +import lazy_loader as lazy import numpy as np import regex from loguru import logger @@ -14,17 +15,15 @@ from tqdm import tqdm from typing_extensions import Annotated -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.model_utils import prepare_sentencepiece_model -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..common.helper_func import UnionFind, split_on_whitespace OP_NAME = 'document_minhash_deduplicator' -with AvailabilityChecking(['scipy'], OP_NAME): - from scipy.integrate import quad as integrate +integrate = lazy.load('scipy.integrate') MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) @@ -70,7 +69,7 @@ def false_positive_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - s**float(rows))**float(band) - a, _ = integrate(proba, 0.0, th) + a, _ = integrate.quad(proba, 0.0, th) return a def false_negative_probability(th: float, band: int, rows: int): @@ -79,7 +78,7 @@ def false_negative_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - (1 - s**float(rows))**float(band)) - a, _ = integrate(proba, th, 1.0) + a, _ = integrate.quad(proba, th, 1.0) return a # object: minimize the weighted FP and FN ratio @@ -152,6 +151,7 @@ def __init__( sentencepiece tokenization. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['scipy']) # about minhash computation self.tokenization = tokenization self.window_size = window_size diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index b536bca95..e5f994682 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -5,21 +5,20 @@ from collections import defaultdict, deque from typing import Dict, Optional, Set +import lazy_loader as lazy import numpy as np import regex from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..common.helper_func import split_on_whitespace OP_NAME = 'document_simhash_deduplicator' -with AvailabilityChecking(['simhash-pybind'], OP_NAME): - import simhash +simhash = lazy.load('simhash') @OPERATORS.register_module(OP_NAME) @@ -57,6 +56,7 @@ def __init__(self, """ # about simhash computation super().__init__(*args, **kwargs) + AUTOINSTALL.check(['simhash-pybind']) self.tokenization = tokenization self.window_size = window_size self.lowercase = lowercase diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 828fab87f..5d6e8b3ba 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -1,34 +1,29 @@ from collections import defaultdict from typing import Dict, Set, Tuple +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..op_fusion import LOADED_IMAGES from .document_deduplicator import DocumentDeduplicator OP_NAME = 'image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = lazy.load('imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - return mapping[method_name] + mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -54,6 +49,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index 038af481f..8444a1f94 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -1,31 +1,26 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import OPERATORS +from ..base_op import AUTOINSTALL, OPERATORS from ..op_fusion import LOADED_IMAGES from .ray_basic_deduplicator import RayBasicDeduplicator OP_NAME = 'ray_image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = lazy.load('imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - return mapping[method_name] + mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -53,6 +48,7 @@ def __init__(self, redis_port=redis_port, *args, **kwargs) + AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 4e4112453..e6ea7cc7e 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -1,17 +1,13 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'alphanumeric_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 - @OPERATORS.register_module('alphanumeric_filter') class AlphanumericFilter(Filter): @@ -43,6 +39,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['transformers']) self.tokenization = tokenization self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index 84aa96036..2966313fc 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -6,21 +6,17 @@ from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model from ...utils.asset_utils import ASSET_DIR, load_words_asset -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'flagged_words_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -58,6 +54,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.max_ratio = max_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index bc6a2df19..8924aee8d 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -1,25 +1,18 @@ +import lazy_loader as lazy import numpy as np from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from ...utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_aesthetics_filter' CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGs, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -56,6 +49,8 @@ def __init__(self, """ super().__init__(*args, **kwargs) + AUTOINSTALL.check( + ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 2b5d06677..76071f602 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -1,21 +1,20 @@ import os +import lazy_loader as lazy import numpy as np from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, load_image) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_face_ratio_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = lazy.load('cv2') @UNFORKABLE.register_module(OP_NAME) @@ -54,6 +53,7 @@ def __init__(self, :param kwargs: Extra keyword arguments. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python']) if cv_classifier == '': cv_classifier = os.path.join(cv2.data.haarcascades, diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 81f878b5f..50ac74a78 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -1,21 +1,17 @@ +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -47,6 +43,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index d5c6ad87c..dda7bd153 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -1,23 +1,19 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_matching_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling blip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -61,6 +57,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index f6d2a0658..ca74441ca 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -1,24 +1,19 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -62,6 +57,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index 620e80a09..4369dcafe 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -1,21 +1,17 @@ +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -51,6 +47,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 69283cf8a..9da08f6a5 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -1,17 +1,16 @@ from typing import List, Union +import lazy_loader as lazy from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'language_id_score_filter' -with AvailabilityChecking(['fasttext-wheel'], OP_NAME): - import fasttext # noqa: F401 +fasttext = lazy.load('fasttext') @OPERATORS.register_module(OP_NAME) @@ -34,6 +33,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['fasttext', 'fasttext-wheel']) if not lang: # lang is [], '' or None self.lang = None diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index 9b532d7c6..ab031157b 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -2,19 +2,19 @@ # https://huggingface.co/spaces/huggingface/text-data-filtering # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document from ..op_fusion import INTER_WORDS OP_NAME = 'perplexity_filter' -with AvailabilityChecking(['sentencepiece', 'kenlm'], OP_NAME): - import kenlm # noqa: F401 - import sentencepiece # noqa: F401 +kenlm = lazy.load('kenlm') +sentencepiece = lazy.load('sentencepiece') @OPERATORS.register_module(OP_NAME) @@ -40,6 +40,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece', 'kenlm']) self.max_ppl = max_ppl self.lang = lang self.sp_model_key = prepare_model(model_type='sentencepiece', diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index ad7afe902..9a9ba65dd 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -1,30 +1,24 @@ from typing import List +import lazy_loader as lazy import numpy as np from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, iou, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'phrase_grounding_recall_filter' -with AvailabilityChecking(['torch', 'transformers', 'nltk'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - - import nltk +torch = lazy.load('torch') +transformers = lazy.load('transformers') +nltk = lazy.load('nltk') # NER algorithm adapted from GLIP starts @@ -122,6 +116,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'nltk']) self.min_recall = min_recall self.max_recall = max_recall if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 57dd138d1..1d9f59b7b 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -4,22 +4,21 @@ from typing import List +import lazy_loader as lazy from pydantic import PositiveInt from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'stopwords_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 +sentencepiece = lazy.load('sentencepiece') @OPERATORS.register_module(OP_NAME) @@ -57,6 +56,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.min_ratio = min_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py index 9e27217e7..44c67920d 100644 --- a/data_juicer/ops/filter/text_action_filter.py +++ b/data_juicer/ops/filter/text_action_filter.py @@ -2,7 +2,7 @@ from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'text_action_filter' @@ -28,6 +28,7 @@ def __init__(self, parameter. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['spacy-pkuseg']) if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index b425ac17b..6e4ec9f36 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -4,7 +4,7 @@ from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'text_entity_dependency_filter' @@ -35,6 +35,7 @@ def __init__(self, sample only if all images are dependent. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['spacy-pkuseg']) if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index d3a31c338..de3349315 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -1,16 +1,16 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'token_num_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -38,6 +38,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['transformers']) self.min_num = min_num self.max_num = max_num self.hf_tokenizer = hf_tokenizer diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 69129b60d..31c242473 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -1,28 +1,20 @@ +import lazy_loader as lazy import numpy as np from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from ...utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_aesthetics_filter' -CHECK_PKGS = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGS, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -83,6 +75,8 @@ def __init__(self, """ super().__init__(*args, **kwargs) + AUTOINSTALL.check( + ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index eae51f66a..ddcbff1e7 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -1,8 +1,8 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, @@ -11,18 +11,13 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_frames_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -84,6 +79,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if frame_sampling_method not in ['all_keyframes', 'uniform']: diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index daf94f273..e8e63f052 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -2,18 +2,17 @@ from contextlib import contextmanager from typing import Optional, Tuple, Union +import lazy_loader as lazy import numpy as np from pydantic import PositiveFloat, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter OP_NAME = 'video_motion_score_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = lazy.load('cv2') @contextmanager @@ -80,6 +79,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python']) self.min_score = min_score self.max_score = max_score self.sampling_fps = sampling_fps diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 8ce40c045..a96151f3e 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -1,25 +1,20 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -72,6 +67,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index c0a3f1c65..a36214fbc 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py @@ -1,22 +1,21 @@ from typing import List, Union +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer import cuda_device_count -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_video_frames_uniformly, load_data_with_context, load_video) -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_ocr_area_ratio_filter' -with AvailabilityChecking(['easyocr'], OP_NAME): - import easyocr +easyocr = lazy.load('easyocr') def triangle_area(p1, p2, p3): @@ -73,6 +72,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['easyocr']) self.min_area_ratio = min_area_ratio self.max_area_ratio = max_area_ratio self.frame_sample_num = frame_sample_num diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 056233a9c..f85cfaa54 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -3,25 +3,15 @@ import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..mapper.video_tagging_from_frames_mapper import \ VideoTaggingFromFramesMapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_tagging_from_frames_filter' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) - @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @@ -72,6 +62,10 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check([ + 'torch', + 'ram@git+https://github.com/xinyu1205/recognize-anything.git' + ]) if contain not in ['any', 'all']: raise ValueError(f'the containing type [{contain}] is not ' f'supported. Can only be one of ["any", "all"].') diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 45f2d11d5..c5ddfc8b7 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -1,25 +1,19 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -76,6 +70,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 3e9cad251..41a081694 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -4,20 +4,16 @@ from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'word_repetition_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -51,6 +47,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.n = rep_len self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 978c252ad..413a2171d 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -1,19 +1,15 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'words_num_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -45,6 +41,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.min_num = min_num self.max_num = max_num self.model_key = None diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index 0c5341662..b6434c0f4 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -1,16 +1,17 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'audio_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +with HiddenPrints(): + ffmpeg = lazy.load('ffmpeg') @OPERATORS.register_module(OP_NAME) @@ -40,6 +41,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) self.filter_name = filter_name diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 8e6bb9dc1..e18fa0afc 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,11 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'chinese_convert_mapper' -with AvailabilityChecking(['opencc'], OP_NAME): - import opencc # noqa: F401 +opencc = lazy.load('opencc') OPENCC_CONVERTER = None @@ -75,6 +74,7 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencc']) mode_list = [ 's2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t' diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index 09e847dd0..477c46846 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -2,14 +2,13 @@ # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'clean_html_mapper' -with AvailabilityChecking(['selectolax'], OP_NAME): - from selectolax.parser import HTMLParser +selectolax = lazy.load('selectolax') @OPERATORS.register_module(OP_NAME) @@ -26,6 +25,7 @@ def __init__(self, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['selectolax']) def process(self, samples): @@ -34,7 +34,7 @@ def _clean_html(raw_html): raw_html = raw_html.replace('', '') raw_html = raw_html.replace('