diff --git a/data_juicer/ops/base_op.py b/data_juicer/ops/base_op.py index a3f5c17e4..901e8523e 100644 --- a/data_juicer/ops/base_op.py +++ b/data_juicer/ops/base_op.py @@ -1,4 +1,5 @@ import copy +import os import traceback from functools import wraps @@ -6,6 +7,7 @@ from loguru import logger from data_juicer import is_cuda_available +from data_juicer.utils.auto_install_utils import AutoInstaller from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import size_to_bytes from data_juicer.utils.process_utils import calculate_np @@ -13,6 +15,10 @@ OPERATORS = Registry('Operators') UNFORKABLE = Registry('Unforkable') +current_path = os.path.dirname(os.path.realpath(__file__)) +version_file_path = os.path.join(current_path, + '../../environments/science_requires.txt') +AUTOINSTALL = AutoInstaller([version_file_path]) def convert_list_dict_to_dict_list(samples): diff --git a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py index 6fa47c869..54b2edc4f 100644 --- a/data_juicer/ops/deduplicator/document_minhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_minhash_deduplicator.py @@ -7,6 +7,7 @@ from collections import defaultdict from typing import Optional +import lazy_loader as lazy import numpy as np import regex from loguru import logger @@ -14,17 +15,15 @@ from tqdm import tqdm from typing_extensions import Annotated -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.model_utils import prepare_sentencepiece_model -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..common.helper_func import UnionFind, split_on_whitespace OP_NAME = 'document_minhash_deduplicator' -with AvailabilityChecking(['scipy'], OP_NAME): - from scipy.integrate import quad as integrate +integrate = lazy.load('scipy.integrate') MERSENNE_PRIME = np.uint64((1 << 61) - 1) MAX_HASH = np.uint64((1 << 32) - 1) @@ -70,7 +69,7 @@ def false_positive_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - s**float(rows))**float(band) - a, _ = integrate(proba, 0.0, th) + a, _ = integrate.quad(proba, 0.0, th) return a def false_negative_probability(th: float, band: int, rows: int): @@ -79,7 +78,7 @@ def false_negative_probability(th: float, band: int, rows: int): def proba(s): return 1 - (1 - (1 - s**float(rows))**float(band)) - a, _ = integrate(proba, th, 1.0) + a, _ = integrate.quad(proba, th, 1.0) return a # object: minimize the weighted FP and FN ratio @@ -152,6 +151,7 @@ def __init__( sentencepiece tokenization. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['scipy']) # about minhash computation self.tokenization = tokenization self.window_size = window_size diff --git a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py index b536bca95..e5f994682 100644 --- a/data_juicer/ops/deduplicator/document_simhash_deduplicator.py +++ b/data_juicer/ops/deduplicator/document_simhash_deduplicator.py @@ -5,21 +5,20 @@ from collections import defaultdict, deque from typing import Dict, Optional, Set +import lazy_loader as lazy import numpy as np import regex from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..common.helper_func import split_on_whitespace OP_NAME = 'document_simhash_deduplicator' -with AvailabilityChecking(['simhash-pybind'], OP_NAME): - import simhash +simhash = lazy.load('simhash') @OPERATORS.register_module(OP_NAME) @@ -57,6 +56,7 @@ def __init__(self, """ # about simhash computation super().__init__(*args, **kwargs) + AUTOINSTALL.check(['simhash-pybind']) self.tokenization = tokenization self.window_size = window_size self.lowercase = lowercase diff --git a/data_juicer/ops/deduplicator/image_deduplicator.py b/data_juicer/ops/deduplicator/image_deduplicator.py index 828fab87f..5d6e8b3ba 100644 --- a/data_juicer/ops/deduplicator/image_deduplicator.py +++ b/data_juicer/ops/deduplicator/image_deduplicator.py @@ -1,34 +1,29 @@ from collections import defaultdict from typing import Dict, Set, Tuple +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import OPERATORS, Deduplicator +from ..base_op import AUTOINSTALL, OPERATORS, Deduplicator from ..op_fusion import LOADED_IMAGES from .document_deduplicator import DocumentDeduplicator OP_NAME = 'image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = lazy.load('imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - return mapping[method_name] + mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -54,6 +49,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/deduplicator/ray_image_deduplicator.py b/data_juicer/ops/deduplicator/ray_image_deduplicator.py index 038af481f..8444a1f94 100644 --- a/data_juicer/ops/deduplicator/ray_image_deduplicator.py +++ b/data_juicer/ops/deduplicator/ray_image_deduplicator.py @@ -1,31 +1,26 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.mm_utils import load_data_with_context, load_image -from ..base_op import OPERATORS +from ..base_op import AUTOINSTALL, OPERATORS from ..op_fusion import LOADED_IMAGES from .ray_basic_deduplicator import RayBasicDeduplicator OP_NAME = 'ray_image_deduplicator' -with AvailabilityChecking(['imagededup'], OP_NAME): - import imagededup # noqa: F401 +imagededup = lazy.load('imagededup') - HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} +HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'} - def get_hash_method(method_name): - from imagededup.methods import AHash, DHash, PHash, WHash - mapping = { - 'phash': PHash, - 'dhash': DHash, - 'whash': WHash, - 'ahash': AHash - } +def get_hash_method(method_name): + from imagededup.methods import AHash, DHash, PHash, WHash - return mapping[method_name] + mapping = {'phash': PHash, 'dhash': DHash, 'whash': WHash, 'ahash': AHash} + + return mapping[method_name] @OPERATORS.register_module(OP_NAME) @@ -53,6 +48,7 @@ def __init__(self, redis_port=redis_port, *args, **kwargs) + AUTOINSTALL.check(['imagededup']) if method not in HASH_METHOD: raise ValueError(f'Keep strategy [{method}] is not supported. ' f'Can only be one of {HASH_METHOD}.') diff --git a/data_juicer/ops/filter/alphanumeric_filter.py b/data_juicer/ops/filter/alphanumeric_filter.py index 4e4112453..e6ea7cc7e 100644 --- a/data_juicer/ops/filter/alphanumeric_filter.py +++ b/data_juicer/ops/filter/alphanumeric_filter.py @@ -1,17 +1,13 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'alphanumeric_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 - @OPERATORS.register_module('alphanumeric_filter') class AlphanumericFilter(Filter): @@ -43,6 +39,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['transformers']) self.tokenization = tokenization self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/flagged_words_filter.py b/data_juicer/ops/filter/flagged_words_filter.py index 84aa96036..2966313fc 100644 --- a/data_juicer/ops/filter/flagged_words_filter.py +++ b/data_juicer/ops/filter/flagged_words_filter.py @@ -6,21 +6,17 @@ from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model from ...utils.asset_utils import ASSET_DIR, load_words_asset -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'flagged_words_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -58,6 +54,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.max_ratio = max_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/image_aesthetics_filter.py b/data_juicer/ops/filter/image_aesthetics_filter.py index bc6a2df19..8924aee8d 100644 --- a/data_juicer/ops/filter/image_aesthetics_filter.py +++ b/data_juicer/ops/filter/image_aesthetics_filter.py @@ -1,25 +1,18 @@ +import lazy_loader as lazy import numpy as np from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from ...utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_aesthetics_filter' CHECK_PKGs = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGs, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -56,6 +49,8 @@ def __init__(self, """ super().__init__(*args, **kwargs) + AUTOINSTALL.check( + ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/image_face_ratio_filter.py b/data_juicer/ops/filter/image_face_ratio_filter.py index 2b5d06677..76071f602 100644 --- a/data_juicer/ops/filter/image_face_ratio_filter.py +++ b/data_juicer/ops/filter/image_face_ratio_filter.py @@ -1,21 +1,20 @@ import os +import lazy_loader as lazy import numpy as np from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, load_image) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_face_ratio_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = lazy.load('cv2') @UNFORKABLE.register_module(OP_NAME) @@ -54,6 +53,7 @@ def __init__(self, :param kwargs: Extra keyword arguments. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python']) if cv_classifier == '': cv_classifier = os.path.join(cv2.data.haarcascades, diff --git a/data_juicer/ops/filter/image_nsfw_filter.py b/data_juicer/ops/filter/image_nsfw_filter.py index 81f878b5f..50ac74a78 100644 --- a/data_juicer/ops/filter/image_nsfw_filter.py +++ b/data_juicer/ops/filter/image_nsfw_filter.py @@ -1,21 +1,17 @@ +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -47,6 +43,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' diff --git a/data_juicer/ops/filter/image_text_matching_filter.py b/data_juicer/ops/filter/image_text_matching_filter.py index d5c6ad87c..dda7bd153 100644 --- a/data_juicer/ops/filter/image_text_matching_filter.py +++ b/data_juicer/ops/filter/image_text_matching_filter.py @@ -1,23 +1,19 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_matching_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling blip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -61,6 +57,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_text_similarity_filter.py b/data_juicer/ops/filter/image_text_similarity_filter.py index f6d2a0658..ca74441ca 100644 --- a/data_juicer/ops/filter/image_text_similarity_filter.py +++ b/data_juicer/ops/filter/image_text_similarity_filter.py @@ -1,24 +1,19 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -62,6 +57,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/image_watermark_filter.py b/data_juicer/ops/filter/image_watermark_filter.py index 620e80a09..4369dcafe 100644 --- a/data_juicer/ops/filter/image_watermark_filter.py +++ b/data_juicer/ops/filter/image_watermark_filter.py @@ -1,21 +1,17 @@ +import lazy_loader as lazy import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import load_data_with_context, load_image from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -51,6 +47,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if any_or_all not in ['any', 'all']: raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' diff --git a/data_juicer/ops/filter/language_id_score_filter.py b/data_juicer/ops/filter/language_id_score_filter.py index 69283cf8a..9da08f6a5 100644 --- a/data_juicer/ops/filter/language_id_score_filter.py +++ b/data_juicer/ops/filter/language_id_score_filter.py @@ -1,17 +1,16 @@ from typing import List, Union +import lazy_loader as lazy from loguru import logger -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'language_id_score_filter' -with AvailabilityChecking(['fasttext-wheel'], OP_NAME): - import fasttext # noqa: F401 +fasttext = lazy.load('fasttext') @OPERATORS.register_module(OP_NAME) @@ -34,6 +33,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['fasttext', 'fasttext-wheel']) if not lang: # lang is [], '' or None self.lang = None diff --git a/data_juicer/ops/filter/perplexity_filter.py b/data_juicer/ops/filter/perplexity_filter.py index 9b532d7c6..ab031157b 100644 --- a/data_juicer/ops/filter/perplexity_filter.py +++ b/data_juicer/ops/filter/perplexity_filter.py @@ -2,19 +2,19 @@ # https://huggingface.co/spaces/huggingface/text-data-filtering # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document from ..op_fusion import INTER_WORDS OP_NAME = 'perplexity_filter' -with AvailabilityChecking(['sentencepiece', 'kenlm'], OP_NAME): - import kenlm # noqa: F401 - import sentencepiece # noqa: F401 +kenlm = lazy.load('kenlm') +sentencepiece = lazy.load('sentencepiece') @OPERATORS.register_module(OP_NAME) @@ -40,6 +40,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece', 'kenlm']) self.max_ppl = max_ppl self.lang = lang self.sp_model_key = prepare_model(model_type='sentencepiece', diff --git a/data_juicer/ops/filter/phrase_grounding_recall_filter.py b/data_juicer/ops/filter/phrase_grounding_recall_filter.py index ad7afe902..9a9ba65dd 100644 --- a/data_juicer/ops/filter/phrase_grounding_recall_filter.py +++ b/data_juicer/ops/filter/phrase_grounding_recall_filter.py @@ -1,30 +1,24 @@ from typing import List +import lazy_loader as lazy import numpy as np from loguru import logger from PIL import ImageOps -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, iou, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import LOADED_IMAGES OP_NAME = 'phrase_grounding_recall_filter' -with AvailabilityChecking(['torch', 'transformers', 'nltk'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - - import nltk +torch = lazy.load('torch') +transformers = lazy.load('transformers') +nltk = lazy.load('nltk') # NER algorithm adapted from GLIP starts @@ -122,6 +116,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'nltk']) self.min_recall = min_recall self.max_recall = max_recall if reduce_mode not in ['avg', 'max', 'min']: diff --git a/data_juicer/ops/filter/stopwords_filter.py b/data_juicer/ops/filter/stopwords_filter.py index 57dd138d1..1d9f59b7b 100644 --- a/data_juicer/ops/filter/stopwords_filter.py +++ b/data_juicer/ops/filter/stopwords_filter.py @@ -4,22 +4,21 @@ from typing import List +import lazy_loader as lazy from pydantic import PositiveInt from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'stopwords_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 +sentencepiece = lazy.load('sentencepiece') @OPERATORS.register_module(OP_NAME) @@ -57,6 +56,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.lang = lang self.min_ratio = min_ratio self.use_words_aug = use_words_aug diff --git a/data_juicer/ops/filter/text_action_filter.py b/data_juicer/ops/filter/text_action_filter.py index 9e27217e7..44c67920d 100644 --- a/data_juicer/ops/filter/text_action_filter.py +++ b/data_juicer/ops/filter/text_action_filter.py @@ -2,7 +2,7 @@ from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'text_action_filter' @@ -28,6 +28,7 @@ def __init__(self, parameter. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['spacy-pkuseg']) if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/text_entity_dependency_filter.py b/data_juicer/ops/filter/text_entity_dependency_filter.py index b425ac17b..6e4ec9f36 100644 --- a/data_juicer/ops/filter/text_entity_dependency_filter.py +++ b/data_juicer/ops/filter/text_entity_dependency_filter.py @@ -4,7 +4,7 @@ from data_juicer.utils.mm_utils import remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter OP_NAME = 'text_entity_dependency_filter' @@ -35,6 +35,7 @@ def __init__(self, sample only if all images are dependent. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['spacy-pkuseg']) if lang not in ['en', 'zh']: raise ValueError( diff --git a/data_juicer/ops/filter/token_num_filter.py b/data_juicer/ops/filter/token_num_filter.py index d3a31c338..de3349315 100644 --- a/data_juicer/ops/filter/token_num_filter.py +++ b/data_juicer/ops/filter/token_num_filter.py @@ -1,16 +1,16 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import get_words_from_document OP_NAME = 'token_num_filter' -with AvailabilityChecking(['transformers'], OP_NAME): - import transformers # noqa: F401 +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -38,6 +38,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['transformers']) self.min_num = min_num self.max_num = max_num self.hf_tokenizer = hf_tokenizer diff --git a/data_juicer/ops/filter/video_aesthetics_filter.py b/data_juicer/ops/filter/video_aesthetics_filter.py index 69129b60d..31c242473 100644 --- a/data_juicer/ops/filter/video_aesthetics_filter.py +++ b/data_juicer/ops/filter/video_aesthetics_filter.py @@ -1,28 +1,20 @@ +import lazy_loader as lazy import numpy as np from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from ...utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_aesthetics_filter' -CHECK_PKGS = ['torch', 'transformers', 'simple-aesthetics-predictor'] -with AvailabilityChecking(CHECK_PKGS, OP_NAME): - - import aesthetics_predictor # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -83,6 +75,8 @@ def __init__(self, """ super().__init__(*args, **kwargs) + AUTOINSTALL.check( + ['torch', 'transformers', 'simple-aesthetics-predictor']) if hf_scorer_model == '': hf_scorer_model = \ 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' diff --git a/data_juicer/ops/filter/video_frames_text_similarity_filter.py b/data_juicer/ops/filter/video_frames_text_similarity_filter.py index eae51f66a..ddcbff1e7 100644 --- a/data_juicer/ops/filter/video_frames_text_similarity_filter.py +++ b/data_juicer/ops/filter/video_frames_text_similarity_filter.py @@ -1,8 +1,8 @@ +import lazy_loader as lazy import numpy as np from PIL import ImageOps from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, @@ -11,18 +11,13 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_frames_text_similarity_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -84,6 +79,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.min_score = min_score self.max_score = max_score if frame_sampling_method not in ['all_keyframes', 'uniform']: diff --git a/data_juicer/ops/filter/video_motion_score_filter.py b/data_juicer/ops/filter/video_motion_score_filter.py index daf94f273..e8e63f052 100644 --- a/data_juicer/ops/filter/video_motion_score_filter.py +++ b/data_juicer/ops/filter/video_motion_score_filter.py @@ -2,18 +2,17 @@ from contextlib import contextmanager from typing import Optional, Tuple, Union +import lazy_loader as lazy import numpy as np from pydantic import PositiveFloat, PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter OP_NAME = 'video_motion_score_filter' -with AvailabilityChecking(['opencv-python'], OP_NAME): - import cv2 +cv2 = lazy.load('cv2') @contextmanager @@ -80,6 +79,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python']) self.min_score = min_score self.max_score = max_score self.sampling_fps = sampling_fps diff --git a/data_juicer/ops/filter/video_nsfw_filter.py b/data_juicer/ops/filter/video_nsfw_filter.py index 8ce40c045..a96151f3e 100644 --- a/data_juicer/ops/filter/video_nsfw_filter.py +++ b/data_juicer/ops/filter/video_nsfw_filter.py @@ -1,25 +1,20 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_nsfw_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling nsfw detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') +transformers = lazy.load('transformers') @OPERATORS.register_module(OP_NAME) @@ -72,6 +67,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.score_threshold = score_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py index c0a3f1c65..a36214fbc 100644 --- a/data_juicer/ops/filter/video_ocr_area_ratio_filter.py +++ b/data_juicer/ops/filter/video_ocr_area_ratio_filter.py @@ -1,22 +1,21 @@ from typing import List, Union +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt from data_juicer import cuda_device_count -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_video_frames_uniformly, load_data_with_context, load_video) -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_ocr_area_ratio_filter' -with AvailabilityChecking(['easyocr'], OP_NAME): - import easyocr +easyocr = lazy.load('easyocr') def triangle_area(p1, p2, p3): @@ -73,6 +72,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['easyocr']) self.min_area_ratio = min_area_ratio self.max_area_ratio = max_area_ratio self.frame_sample_num = frame_sample_num diff --git a/data_juicer/ops/filter/video_tagging_from_frames_filter.py b/data_juicer/ops/filter/video_tagging_from_frames_filter.py index 056233a9c..f85cfaa54 100644 --- a/data_juicer/ops/filter/video_tagging_from_frames_filter.py +++ b/data_juicer/ops/filter/video_tagging_from_frames_filter.py @@ -3,25 +3,15 @@ import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from ..base_op import OPERATORS, UNFORKABLE, Filter +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Filter from ..mapper.video_tagging_from_frames_mapper import \ VideoTaggingFromFramesMapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_tagging_from_frames_filter' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) - @UNFORKABLE.register_module(OP_NAME) @OPERATORS.register_module(OP_NAME) @@ -72,6 +62,10 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check([ + 'torch', + 'ram@git+https://github.com/xinyu1205/recognize-anything.git' + ]) if contain not in ['any', 'all']: raise ValueError(f'the containing type [{contain}] is not ' f'supported. Can only be one of ["any", "all"].') diff --git a/data_juicer/ops/filter/video_watermark_filter.py b/data_juicer/ops/filter/video_watermark_filter.py index 45f2d11d5..c5ddfc8b7 100644 --- a/data_juicer/ops/filter/video_watermark_filter.py +++ b/data_juicer/ops/filter/video_watermark_filter.py @@ -1,25 +1,19 @@ +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS OP_NAME = 'video_watermark_filter' -with AvailabilityChecking(['torch', 'transformers'], OP_NAME): - - import torch - import transformers # noqa: F401 - - # avoid hanging when calling watermark detection in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -76,6 +70,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers']) self.prob_threshold = prob_threshold if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( diff --git a/data_juicer/ops/filter/word_repetition_filter.py b/data_juicer/ops/filter/word_repetition_filter.py index 3e9cad251..41a081694 100644 --- a/data_juicer/ops/filter/word_repetition_filter.py +++ b/data_juicer/ops/filter/word_repetition_filter.py @@ -4,20 +4,16 @@ from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'word_repetition_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -51,6 +47,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.n = rep_len self.min_ratio = min_ratio self.max_ratio = max_ratio diff --git a/data_juicer/ops/filter/words_num_filter.py b/data_juicer/ops/filter/words_num_filter.py index 978c252ad..413a2171d 100644 --- a/data_juicer/ops/filter/words_num_filter.py +++ b/data_juicer/ops/filter/words_num_filter.py @@ -1,19 +1,15 @@ import sys -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, InterVars, StatsKeys from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Filter +from ..base_op import AUTOINSTALL, OPERATORS, Filter from ..common import (SPECIAL_CHARACTERS, get_words_from_document, words_refinement) from ..op_fusion import INTER_WORDS OP_NAME = 'words_num_filter' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) @INTER_WORDS.register_module(OP_NAME) @@ -45,6 +41,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.min_num = min_num self.max_num = max_num self.model_key = None diff --git a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py index 0c5341662..b6434c0f4 100644 --- a/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.py @@ -1,16 +1,17 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'audio_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +with HiddenPrints(): + ffmpeg = lazy.load('ffmpeg') @OPERATORS.register_module(OP_NAME) @@ -40,6 +41,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) self.filter_name = filter_name diff --git a/data_juicer/ops/mapper/chinese_convert_mapper.py b/data_juicer/ops/mapper/chinese_convert_mapper.py index 8e6bb9dc1..e18fa0afc 100644 --- a/data_juicer/ops/mapper/chinese_convert_mapper.py +++ b/data_juicer/ops/mapper/chinese_convert_mapper.py @@ -1,11 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'chinese_convert_mapper' -with AvailabilityChecking(['opencc'], OP_NAME): - import opencc # noqa: F401 +opencc = lazy.load('opencc') OPENCC_CONVERTER = None @@ -75,6 +74,7 @@ def __init__(self, mode: str = 's2t', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencc']) mode_list = [ 's2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t' diff --git a/data_juicer/ops/mapper/clean_html_mapper.py b/data_juicer/ops/mapper/clean_html_mapper.py index 09e847dd0..477c46846 100644 --- a/data_juicer/ops/mapper/clean_html_mapper.py +++ b/data_juicer/ops/mapper/clean_html_mapper.py @@ -2,14 +2,13 @@ # https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/ # -------------------------------------------------------- -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'clean_html_mapper' -with AvailabilityChecking(['selectolax'], OP_NAME): - from selectolax.parser import HTMLParser +selectolax = lazy.load('selectolax') @OPERATORS.register_module(OP_NAME) @@ -26,6 +25,7 @@ def __init__(self, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['selectolax']) def process(self, samples): @@ -34,7 +34,7 @@ def _clean_html(raw_html): raw_html = raw_html.replace('', '') raw_html = raw_html.replace('
    ', '\n*') raw_html = raw_html.replace('
', '') - parser = HTMLParser(raw_html) + parser = selectolax.parser.HTMLParser(raw_html) return parser.text() samples[self.text_key] = [ diff --git a/data_juicer/ops/mapper/fix_unicode_mapper.py b/data_juicer/ops/mapper/fix_unicode_mapper.py index b44005076..e2323c3b9 100644 --- a/data_juicer/ops/mapper/fix_unicode_mapper.py +++ b/data_juicer/ops/mapper/fix_unicode_mapper.py @@ -1,11 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'fix_unicode_mapper' -with AvailabilityChecking(['ftfy'], OP_NAME): - import ftfy +ftfy = lazy.load('ftfy') @OPERATORS.register_module(OP_NAME) @@ -25,6 +24,7 @@ def __init__(self, normalization: str = None, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ftfy']) if normalization and len(normalization) > 0: self.normalization = normalization.upper() else: diff --git a/data_juicer/ops/mapper/image_captioning_mapper.py b/data_juicer/ops/mapper/image_captioning_mapper.py index 0e3b3a39c..5f04fa97f 100644 --- a/data_juicer/ops/mapper/image_captioning_mapper.py +++ b/data_juicer/ops/mapper/image_captioning_mapper.py @@ -6,7 +6,6 @@ from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.mm_utils import (SpecialTokens, insert_texts_after_placeholders, @@ -14,20 +13,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_captioning_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling model in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) @@ -89,6 +79,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' diff --git a/data_juicer/ops/mapper/image_diffusion_mapper.py b/data_juicer/ops/mapper/image_diffusion_mapper.py index bd8702b5c..a69d8ac6a 100644 --- a/data_juicer/ops/mapper/image_diffusion_mapper.py +++ b/data_juicer/ops/mapper/image_diffusion_mapper.py @@ -6,28 +6,17 @@ from pydantic import Field, PositiveInt from typing_extensions import Annotated -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context, load_image, remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_diffusion_mapper' -check_list = ['diffusers', 'torch', 'transformers', 'simhash-pybind'] -with AvailabilityChecking(check_list, OP_NAME): - import diffusers # noqa: F401 - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling stable diffusion in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_IMAGES.register_module(OP_NAME) @@ -103,6 +92,8 @@ def __init__(self, caption_key is None. """ super().__init__(*args, **kwargs) + AUTOINSTALL.check( + ['diffusers', 'torch', 'transformers', 'simhash-pybind']) self._init_parameters = self.remove_extra_parameters(locals()) self.strength = strength self.guidance_scale = guidance_scale diff --git a/data_juicer/ops/mapper/image_face_blur_mapper.py b/data_juicer/ops/mapper/image_face_blur_mapper.py index e47da2e4e..e3d37e21b 100644 --- a/data_juicer/ops/mapper/image_face_blur_mapper.py +++ b/data_juicer/ops/mapper/image_face_blur_mapper.py @@ -1,23 +1,22 @@ import os +import lazy_loader as lazy from loguru import logger from pydantic import NonNegativeFloat -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context, load_image) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, UNFORKABLE, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_IMAGES OP_NAME = 'image_face_blur_mapper' -with AvailabilityChecking(['opencv-python', 'Pillow'], OP_NAME): - import cv2 - from PIL import ImageFilter +cv2 = lazy.load('cv2') +PIL = lazy.load('PIL') @UNFORKABLE.register_module(OP_NAME) @@ -52,6 +51,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python', 'Pillow']) self._init_parameters = self.remove_extra_parameters(locals()) if cv_classifier == '': @@ -65,11 +65,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. ') if blur_type == 'mean': - self.blur = ImageFilter.BLUR + self.blur = PIL.ImageFilter.BLUR elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) + self.blur = PIL.ImageFilter.BoxBlur(radius) else: - self.blur = ImageFilter.GaussianBlur(radius) + self.blur = PIL.ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/nlpaug_en_mapper.py b/data_juicer/ops/mapper/nlpaug_en_mapper.py index 3ec5864c7..9a253c9c2 100644 --- a/data_juicer/ops/mapper/nlpaug_en_mapper.py +++ b/data_juicer/ops/mapper/nlpaug_en_mapper.py @@ -1,19 +1,17 @@ from copy import deepcopy +import lazy_loader as lazy from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking - -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'nlpaug_en_mapper' -with AvailabilityChecking(['nlpaug'], OP_NAME): - import nlpaug.augmenter.char as nac - import nlpaug.augmenter.word as naw - import nlpaug.flow as naf - from nlpaug.util import Action +nlpaug = lazy.load('nlpaug') +nac = lazy.load('nlpaug.augmenter.char') +naw = lazy.load('nlpaug.augmenter.word') +naf = lazy.load('nlpaug.flow') @OPERATORS.register_module(OP_NAME) @@ -87,6 +85,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['nlpaug']) self.aug_num = aug_num if aug_num >= 10: @@ -98,6 +97,7 @@ def __init__(self, aug_pipeline = [] # word level + Action = nlpaug.util.Action if delete_random_word: aug_pipeline.append(naw.RandomWordAug(action=Action.DELETE)) if swap_random_word: diff --git a/data_juicer/ops/mapper/nlpcda_zh_mapper.py b/data_juicer/ops/mapper/nlpcda_zh_mapper.py index 640ea7391..adc718beb 100644 --- a/data_juicer/ops/mapper/nlpcda_zh_mapper.py +++ b/data_juicer/ops/mapper/nlpcda_zh_mapper.py @@ -1,17 +1,16 @@ from copy import deepcopy +import lazy_loader as lazy from loguru import logger from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'nlpcda_zh_mapper' -with AvailabilityChecking(['nlpcda'], OP_NAME), HiddenPrints(): - import nlpcda +nlpcda = lazy.load('nlpcda') @OPERATORS.register_module(OP_NAME) @@ -71,6 +70,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['nlpcda']) self.aug_num = aug_num if aug_num >= 10: diff --git a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py index 97bb61319..eea948097 100644 --- a/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py +++ b/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.py @@ -1,18 +1,14 @@ from typing import List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..common import (SPECIAL_CHARACTERS, get_words_from_document, merge_on_whitespace_tab_newline, split_on_newline_tab_whitespace, strip) OP_NAME = 'remove_words_with_incorrect_substrings_mapper' -with AvailabilityChecking(['sentencepiece'], OP_NAME): - import sentencepiece # noqa: F401 - @OPERATORS.register_module(OP_NAME) class RemoveWordsWithIncorrectSubstringsMapper(Mapper): @@ -38,6 +34,7 @@ def __init__(self, if substrings is None: substrings = ['http', 'www', '.com', 'href', '//'] super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece']) self.tokenization = tokenization self.substrings = substrings self.lang = lang diff --git a/data_juicer/ops/mapper/sentence_split_mapper.py b/data_juicer/ops/mapper/sentence_split_mapper.py index 819cfd55c..c71479de4 100644 --- a/data_juicer/ops/mapper/sentence_split_mapper.py +++ b/data_juicer/ops/mapper/sentence_split_mapper.py @@ -1,14 +1,10 @@ -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..common import get_sentences_from_document OP_NAME = 'sentence_split_mapper' -with AvailabilityChecking(['nltk'], OP_NAME): - import nltk # noqa: F401 - @OPERATORS.register_module(OP_NAME) class SentenceSplitMapper(Mapper): @@ -25,6 +21,7 @@ def __init__(self, lang: str = 'en', *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['nltk']) self.lang = lang self.model_key = prepare_model(model_type='nltk', lang=lang) diff --git a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py index 9b6bc5330..9eaa960e7 100644 --- a/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_audio_mapper.py @@ -3,24 +3,12 @@ import regex as re -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.mm_utils import SpecialTokens, extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper NAME = 'video_captioning_from_audio_mapper' -CHECK_PKGS = [ - 'transformers', 'transformers_stream_generator', 'einops', 'accelerate', - 'tiktoken' -] - -with AvailabilityChecking(CHECK_PKGS, NAME): - import accelerate # noqa: F401 - import einops # noqa: F401 - import tiktoken # noqa: F401 - import transformers # noqa: F401 - import transformers_stream_generator # noqa: F401 @OPERATORS.register_module(NAME) @@ -44,6 +32,10 @@ def __init__(self, keep_original_sample: bool = True, *args, **kwargs): :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check([ + 'transformers', 'transformers_stream_generator', 'einops', + 'accelerate', 'tiktoken' + ]) self.keep_original_sample = keep_original_sample self.extra_args = kwargs diff --git a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py index baa3a4b5f..08eee2add 100644 --- a/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_frames_mapper.py @@ -8,7 +8,6 @@ from PIL import ImageOps from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, @@ -19,21 +18,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_captioning_from_frames_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -117,6 +106,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' diff --git a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py index 3cf0ef618..02cf781a0 100644 --- a/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.py @@ -3,43 +3,13 @@ from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import SpecialTokens, remove_special_tokens from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper NAME = 'video_captioning_from_summarizer_mapper' -CHECK_PKGS = [ - 'torch', - 'transformers', - 'simhash-pybind', # by video caption - 'transformers_stream_generator', - 'einops', - 'accelerate', - 'tiktoken', # by audio caption - 'torchaudio', # by audio tag - 'git+https://github.com/xinyu1205/recognize-anything.git', # by frame tag -] - -with AvailabilityChecking(CHECK_PKGS, NAME): - # video caption - # audio caption - import accelerate # noqa: F401 - import einops # noqa: F401 - # frame tag - import ram # noqa: F401 - import simhash # noqa: F401 - import tiktoken # noqa: F401 - import torch - # audio tag - import torchaudio # noqa: F401 - import transformers # noqa: F401 - import transformers_stream_generator # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) @OPERATORS.register_module(NAME) @@ -111,6 +81,17 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check([ + 'torch', + 'transformers', + 'simhash-pybind', # by video caption + 'transformers_stream_generator', + 'einops', + 'accelerate', + 'tiktoken', # by audio caption + 'torchaudio', # by audio tag + 'ram@git+https://github.com/xinyu1205/recognize-anything.git' + ]) self.keep_original_sample = keep_original_sample self.extra_args = kwargs diff --git a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py index e697bf0cc..ebb26c53b 100644 --- a/data_juicer/ops/mapper/video_captioning_from_video_mapper.py +++ b/data_juicer/ops/mapper/video_captioning_from_video_mapper.py @@ -8,7 +8,6 @@ from PIL import ImageOps from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys from data_juicer.utils.mm_utils import (SpecialTokens, close_video, extract_key_frames, @@ -19,21 +18,11 @@ remove_special_tokens) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_captioning_from_video_mapper' -with AvailabilityChecking(['torch', 'transformers', 'simhash-pybind'], - OP_NAME): - - import simhash # noqa: F401 - import torch - import transformers # noqa: F401 - - # avoid hanging when calling clip in multiprocessing - torch.set_num_threads(1) - @OPERATORS.register_module(OP_NAME) @LOADED_VIDEOS.register_module(OP_NAME) @@ -117,6 +106,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'simhash-pybind']) if keep_candidate_mode not in [ 'random_any', 'similar_one_simhash', 'all' diff --git a/data_juicer/ops/mapper/video_face_blur_mapper.py b/data_juicer/ops/mapper/video_face_blur_mapper.py index a862cd8e0..f30917536 100644 --- a/data_juicer/ops/mapper/video_face_blur_mapper.py +++ b/data_juicer/ops/mapper/video_face_blur_mapper.py @@ -1,8 +1,8 @@ import os import av +import lazy_loader as lazy -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.mm_utils import (close_video, detect_faces, @@ -10,14 +10,13 @@ process_each_frame) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, UNFORKABLE, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_face_blur_mapper' -with AvailabilityChecking(['opencv-python', 'Pillow'], OP_NAME): - import cv2 - from PIL import ImageFilter +cv2 = lazy.load('cv2') +PIL = lazy.load('PIL') @UNFORKABLE.register_module(OP_NAME) @@ -52,6 +51,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python', 'Pillow']) self._init_parameters = self.remove_extra_parameters(locals()) if cv_classifier == '': @@ -65,11 +65,11 @@ def __init__(self, raise ValueError('Radius must be >= 0. ') if blur_type == 'mean': - self.blur = ImageFilter.BLUR + self.blur = PIL.ImageFilter.BLUR elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) + self.blur = PIL.ImageFilter.BoxBlur(radius) else: - self.blur = ImageFilter.GaussianBlur(radius) + self.blur = PIL.ImageFilter.GaussianBlur(radius) self.blur_type = blur_type self.radius = radius diff --git a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py index 6a5a38bcc..c711a6ae8 100644 --- a/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py +++ b/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.py @@ -1,16 +1,17 @@ from typing import Dict, List, Optional -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'video_ffmpeg_wrapped_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +with HiddenPrints(): + ffmpeg = lazy.load('ffmpeg') @OPERATORS.register_module(OP_NAME) @@ -40,6 +41,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) self.filter_name = filter_name diff --git a/data_juicer/ops/mapper/video_remove_watermark_mapper.py b/data_juicer/ops/mapper/video_remove_watermark_mapper.py index 24924a7cd..2c3166e8b 100644 --- a/data_juicer/ops/mapper/video_remove_watermark_mapper.py +++ b/data_juicer/ops/mapper/video_remove_watermark_mapper.py @@ -2,10 +2,10 @@ from typing import List, Optional import av +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints @@ -15,13 +15,13 @@ parse_string_to_roi, process_each_frame) -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_remove_watermark_mapper' -with AvailabilityChecking(['opencv-python'], OP_NAME), HiddenPrints(): - import cv2 as cv +with HiddenPrints(): + cv2 = lazy.load('cv2') @OPERATORS.register_module(OP_NAME) @@ -69,6 +69,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['opencv-python']) self._init_parameters = self.remove_extra_parameters(locals()) if roi_type not in ['ratio', 'pixel']: @@ -113,9 +114,9 @@ def _detect_watermark_via_pixel_value(self, frames, rois): for roi in rois: # dimension of ndarray frame: height x width x channel roi_frame = frame[roi[1]:roi[3], roi[0]:roi[2]] - gray_frame = cv.cvtColor(roi_frame, cv.COLOR_BGR2GRAY) - _, binary_frame = cv.threshold( - gray_frame, 0, 255, cv.THRESH_BINARY + cv.THRESH_OTSU) + gray_frame = cv2.cvtColor(roi_frame, cv2.COLOR_BGR2GRAY) + _, binary_frame = cv2.threshold( + gray_frame, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # assume the watermark is located in the box, so the pixel in # the edge must be 0, if not, reverse binary_frame @@ -154,8 +155,8 @@ def _detect_watermark_via_pixel_diversity(self, frames, rois): else: scaled_diversity = np.zeros_like(pixel_diversity) scaled_diversity = scaled_diversity.astype(np.uint8) - _, binary_frame = cv.threshold(scaled_diversity, 0, 255, - cv.THRESH_BINARY + cv.THRESH_OTSU) + _, binary_frame = cv2.threshold( + scaled_diversity, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) # the watermark pixels have less diversity binary_frame = ~binary_frame mask[roi[1]:roi[3], @@ -194,11 +195,11 @@ def _generate_watermark_mask(self, video, sample): mask = self._detect_watermark_via_pixel_diversity(frames, rois) kernel = np.ones((5, 5), np.uint8) - return cv.dilate(mask, kernel) + return cv2.dilate(mask, kernel) def _clean_watermark(self, frame, watermark_mask): np_frame = frame.to_ndarray(format='bgr24') - new_np_frame = cv.inpaint(np_frame, watermark_mask, 3, cv.INPAINT_NS) + new_np_frame = cv2.inpaint(np_frame, watermark_mask, 3, cv2.INPAINT_NS) return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24') def process(self, sample, context=False): diff --git a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py index e226c2651..99192c9c1 100644 --- a/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py +++ b/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.py @@ -2,18 +2,19 @@ import os from fractions import Fraction -from data_juicer.utils.availability_utils import AvailabilityChecking +import lazy_loader as lazy + from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'video_resize_aspect_ratio_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +with HiddenPrints(): + ffmpeg = lazy.load('ffmpeg') def rescale(width, height, ori_ratio, min_ratio, max_ratio, strategy): @@ -88,6 +89,7 @@ def __init__( :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) strategy = strategy.lower() diff --git a/data_juicer/ops/mapper/video_resize_resolution_mapper.py b/data_juicer/ops/mapper/video_resize_resolution_mapper.py index 5f60d14f3..574dd04d6 100644 --- a/data_juicer/ops/mapper/video_resize_resolution_mapper.py +++ b/data_juicer/ops/mapper/video_resize_resolution_mapper.py @@ -2,21 +2,21 @@ import os import sys +import lazy_loader as lazy from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints from data_juicer.utils.mm_utils import close_video, load_video -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_resize_resolution_mapper' -with AvailabilityChecking(['ffmpeg-python'], OP_NAME), HiddenPrints(): - import ffmpeg +with HiddenPrints(): + ffmpeg = lazy.load('ffmpeg') @OPERATORS.register_module(OP_NAME) @@ -59,6 +59,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['ffmpeg-python']) self._init_parameters = self.remove_extra_parameters(locals()) force_original_aspect_ratio = force_original_aspect_ratio.lower() diff --git a/data_juicer/ops/mapper/video_split_by_scene_mapper.py b/data_juicer/ops/mapper/video_split_by_scene_mapper.py index 4b2e39165..7ce921e09 100644 --- a/data_juicer/ops/mapper/video_split_by_scene_mapper.py +++ b/data_juicer/ops/mapper/video_split_by_scene_mapper.py @@ -2,21 +2,19 @@ import re from itertools import chain +import lazy_loader as lazy from pydantic import NonNegativeFloat, NonNegativeInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import (add_suffix_to_filename, transfer_filename) from data_juicer.utils.mm_utils import SpecialTokens -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'video_split_by_scene_mapper' -with AvailabilityChecking(['scenedetect[opencv]'], OP_NAME): - import scenedetect.detectors - from scenedetect import detect, split_video_ffmpeg +scenedetect = lazy.load('scenedetect') def replace_func(match, scene_counts_iter): @@ -62,6 +60,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['scenedetect[opencv]']) self._init_parameters = self.remove_extra_parameters(locals()) if detector not in self.avaliable_detectors: @@ -107,10 +106,10 @@ def process(self, sample, context=False): # detect scenes detector = self.detector_class(self.threshold, self.min_scene_len, **self.detector_kwargs) - scene_list = detect(video_key, - detector, - show_progress=self.show_progress, - start_in_scene=True) + scene_list = scenedetect.detect(video_key, + detector, + show_progress=self.show_progress, + start_in_scene=True) scene_counts[video_key] = len(scene_list) if len(scene_list) > 1: @@ -122,10 +121,11 @@ def process(self, sample, context=False): for i in range(len(scene_list)) ] # split video into clips - split_video_ffmpeg(input_video_path=video_key, - scene_list=scene_list, - output_file_template=output_template, - show_progress=self.show_progress) + scenedetect.split_video_ffmpeg( + input_video_path=video_key, + scene_list=scene_list, + output_file_template=output_template, + show_progress=self.show_progress) else: output_video_keys[video_key] = [video_key] diff --git a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py index c9f0536e2..164fc46bd 100644 --- a/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_audio_mapper.py @@ -1,22 +1,16 @@ +import lazy_loader as lazy import librosa import numpy as np -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import extract_audio_from_video from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, Mapper OP_NAME = 'video_tagging_from_audio_mapper' -with AvailabilityChecking(['torch', 'transformers', 'torchaudio'], OP_NAME): - import torch - import torchaudio # noqa: F401 - import transformers # noqa: F401 - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) +torch = lazy.load('torch') @OPERATORS.register_module(OP_NAME) @@ -44,6 +38,7 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check(['torch', 'transformers', 'torchaudio']) self.model_key = prepare_model(model_type='huggingface', pretrained_model_name_or_path=hf_ast, trust_remote_code=trust_remote_code) diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py index 8aafdb615..99cb96aa4 100644 --- a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py +++ b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py @@ -1,28 +1,22 @@ from collections import Counter +import lazy_loader as lazy import numpy as np from pydantic import PositiveInt -from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model -from ..base_op import OPERATORS, UNFORKABLE, Mapper +from ..base_op import AUTOINSTALL, OPERATORS, UNFORKABLE, Mapper from ..op_fusion import LOADED_VIDEOS OP_NAME = 'video_tagging_from_frames_mapper' -with AvailabilityChecking( - ['torch', 'git+https://github.com/xinyu1205/recognize-anything.git'], - OP_NAME): - import ram # noqa: F401 - import torch - - # avoid hanging when calling recognizeAnything in multiprocessing - torch.set_num_threads(1) +ram = lazy.load('ram') +torch = lazy.load('torch') @UNFORKABLE.register_module(OP_NAME) @@ -62,6 +56,10 @@ def __init__(self, :param kwargs: extra args """ super().__init__(*args, **kwargs) + AUTOINSTALL.check([ + 'torch', + 'ram@git+https://github.com/xinyu1205/recognize-anything.git' + ]) if frame_sampling_method not in ['all_keyframes', 'uniform']: raise ValueError( f'Frame sampling method [{frame_sampling_method}] is not ' @@ -72,8 +70,7 @@ def __init__(self, input_size=384) self.frame_sampling_method = frame_sampling_method self.frame_num = frame_num - from ram import get_transform - self.transform = get_transform(image_size=384) + self.transform = ram.get_transform(image_size=384) self.tag_field_name = tag_field_name diff --git a/data_juicer/utils/auto_install_utils.py b/data_juicer/utils/auto_install_utils.py new file mode 100644 index 000000000..038637879 --- /dev/null +++ b/data_juicer/utils/auto_install_utils.py @@ -0,0 +1,84 @@ +import importlib +import os +import platform +import subprocess +import sys + +from loguru import logger + +CHECK_SYSTEM_INFO_ONCE = True + + +def _is_package_installed(package_name): + if '@' in package_name: + package_name = package_name.split('@')[0] + if '[' in package_name: + package_name = package_name.split('[')[0] + try: + subprocess.check_output( + [sys.executable, '-m', 'pip', 'show', package_name], + stderr=subprocess.STDOUT) + return True + except subprocess.CalledProcessError: + return False + + +def _torch_check_and_set(): + # only for python3.8 on mac + global CHECK_SYSTEM_INFO_ONCE + if CHECK_SYSTEM_INFO_ONCE and importlib.util.find_spec( + 'torch') is not None: + major, minor = sys.version_info[:2] + system = platform.system() + if major == 3 and minor == 8 and system == 'Darwin': + logger.warning( + 'The torch.set_num_threads function does not ' + 'work in python3.8 version on Mac systems. We will set ' + 'OMP_NUM_THREADS to 1 manually before importing torch') + + os.environ['OMP_NUM_THREADS'] = str(1) + CHECK_SYSTEM_INFO_ONCE = False + import torch + + # avoid hanging when calling clip in multiprocessing + torch.set_num_threads(1) + + +class AutoInstaller(object): + """ + This class is used to install the required + package automatically. + """ + + def __init__(self, require_f_paths=[]): + """ + Initialization method. + + :param require_f_paths: paths to the file for version limitation + """ + self.version_map, reqs = {}, [] + for path in require_f_paths: + if not os.path.exists(path): + logger.warning(f'target file does not exist: {path}') + else: + with open(path, 'r', encoding='utf-8') as fin: + reqs += [x.strip() for x in fin.read().splitlines()] + for req in reqs: + clean_req = req.replace('<', ' ').replace('>', ' ').replace( + '=', ' ').split(' ')[0] + self.version_map[clean_req] = req + + def check(self, check_pkgs): + """ + install if the package is not importable. + """ + for pkg in check_pkgs: + if not _is_package_installed(pkg): + logger.info(f'Installing {pkg} ...') + if pkg in self.version_map: + pkg = self.version_map[pkg] + pip_cmd = [sys.executable, '-m', 'pip', 'install', pkg] + subprocess.check_call(pip_cmd) + logger.info(f'The {pkg} installed.') + if pkg == 'torch': + _torch_check_and_set() diff --git a/data_juicer/utils/availability_utils.py b/data_juicer/utils/availability_utils.py index 7c643ccb3..5f16939af 100644 --- a/data_juicer/utils/availability_utils.py +++ b/data_juicer/utils/availability_utils.py @@ -4,8 +4,9 @@ from loguru import logger +from data_juicer.utils.auto_install_utils import _torch_check_and_set + UNAVAILABLE_OPERATORS = {} -CHECK_SYSTEM_INFO_ONCE = True class UnavailableOperator: @@ -57,23 +58,7 @@ def __init__( f'`pip install -v -e .[{self.requires_type}]`' def __enter__(self): - - # only for python3.8 on mac - global CHECK_SYSTEM_INFO_ONCE - if CHECK_SYSTEM_INFO_ONCE: - import os - import platform - import sys - major, minor = sys.version_info[:2] - system = platform.system() - if major == 3 and minor == 8 and system == 'Darwin': - logger.warning( - 'The torch.set_num_threads function does not ' - 'work in python3.8 version on Mac systems. We will set ' - 'OMP_NUM_THREADS to 1 manually before importing torch') - - os.environ['OMP_NUM_THREADS'] = str(1) - CHECK_SYSTEM_INFO_ONCE = False + _torch_check_and_set() def __exit__(self, exc_type, exc_val, exc_tb): if exc_type is ModuleNotFoundError: diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index d35d45549..03ddd2639 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -618,7 +618,6 @@ def prepare_model(model_type, **model_kwargs): assert (model_type in MODEL_FUNCTION_MAPPING.keys() ), 'model_type must be one of the following: {}'.format( list(MODEL_FUNCTION_MAPPING.keys())) - global MODEL_ZOO model_func = MODEL_FUNCTION_MAPPING[model_type] model_key = partial(model_func, **model_kwargs) # always instantiate once for possible caching @@ -653,3 +652,13 @@ def get_model(model_key=None, rank=None, use_cuda=False): rank = rank % cuda_device_count() move_to_cuda(MODEL_ZOO[model_key], rank) return MODEL_ZOO[model_key] + + +def free_models(): + global MODEL_ZOO + for model_key in MODEL_ZOO: + try: + MODEL_ZOO[model_key].to('cpu') + except Exception: + pass + MODEL_ZOO.clear() diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 604bee72d..039e48eab 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -8,6 +8,7 @@ from data_juicer import is_cuda_available from data_juicer.core.data import DJDataset, NestedDataset from data_juicer.core.ray_data import RayDataset +from data_juicer.utils.model_utils import free_models from data_juicer.utils.registry import Registry SKIPPED_TESTS = Registry('SkippedTests') @@ -59,6 +60,10 @@ def tearDownClass(cls, hf_model_name=None) -> None: print('CLEAN all TRANSFORMERS_CACHE') shutil.rmtree(transformers.TRANSFORMERS_CACHE) + @classmethod + def tearDown(cls) -> None: + free_models() + def generate_dataset(self, data) -> DJDataset: """Generate dataset for a specific executor. diff --git a/docs/DeveloperGuide.md b/docs/DeveloperGuide.md index 4bc80d1ae..42fa5d09e 100644 --- a/docs/DeveloperGuide.md +++ b/docs/DeveloperGuide.md @@ -375,6 +375,31 @@ else: ... ``` +5. As the number of OPs increases, Data-Juicer's dependencies also multiply. To prevent Data-Juicer from becoming excessively burdened with dependencies, we've implemented a strategy that incorporates lazy importing and on-demand installation of additional dependencies required by OPs. Below is an example illustrating this approach: + +```python +# ... (import some library) +from ..base_op import AUTOINSTALL +import lazy_loader as lazy + +# lazy import +kenlm = lazy.load('kenlm') +sentencepiece = lazy.load('sentencepiece') + +class PerplexityFilter(Filter): + def __init__(self, + # ... (OP parameters) + *args, + **kwargs): + # auto install before init + super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece', 'kenlm']) + # ... (some codes) + + def process(self, sample): + # ... (some codes) +``` + ## Build your own configs - We provide easy configuration based on [jsonargparse](https://github.com/omni-us/jsonargparse/) to reduce cost for boilerplate codes. diff --git a/docs/DeveloperGuide_ZH.md b/docs/DeveloperGuide_ZH.md index b3e424452..1574287f6 100644 --- a/docs/DeveloperGuide_ZH.md +++ b/docs/DeveloperGuide_ZH.md @@ -352,6 +352,31 @@ else: ... ``` +5. 随着算子数量的增加,Data-Juicer的依赖也不断增多。为了防止Data-Juicer的依赖越来越重,我们为算子额外增加的依赖提供了一套lazy import加上使用时安装依赖的策略。如下样例: + +```python +# ... (import some library) +from ..base_op import AUTOINSTALL +import lazy_loader as lazy + +# lazy import +kenlm = lazy.load('kenlm') +sentencepiece = lazy.load('sentencepiece') + +class PerplexityFilter(Filter): + def __init__(self, + # ... (OP parameters) + *args, + **kwargs): + # auto install before init + super().__init__(*args, **kwargs) + AUTOINSTALL.check(['sentencepiece', 'kenlm']) + # ... (some codes) + + def process(self, sample): + # ... (some codes) +``` + - 至此,该算子已经能够在算子融合功能开启后,自动地与其他算子进行融合并共享共有的中间变量,减少重复计算,加快整体的数据处理速度 ## 构建自己的配置 diff --git a/environments/science_requires.txt b/environments/science_requires.txt index 1a63e64e8..e51e94f02 100644 --- a/environments/science_requires.txt +++ b/environments/science_requires.txt @@ -1,6 +1,7 @@ torch>=1.11.0 torchaudio easyocr +fasttext fasttext-wheel kenlm sentencepiece diff --git a/tests/ops/filter/test_video_aesthetics_filter.py b/tests/ops/filter/test_video_aesthetics_filter.py index 7f8098853..551d0e721 100644 --- a/tests/ops/filter/test_video_aesthetics_filter.py +++ b/tests/ops/filter/test_video_aesthetics_filter.py @@ -153,7 +153,7 @@ def test_filter_keyframes(self): op = VideoAestheticsFilter(self.hf_aesthetics_scorer, min_score=0.411, max_score=0.45, - frame_sampling_method='keyframe') + frame_sampling_method='all_keyframes') self._run_video_aesthetics_filter(dataset, tgt_list, op) def test_filter_uniform_frames_with_different_frame_num(self):