diff --git a/.coveragerc b/.coveragerc new file mode 100644 index 000000000..d4a7a6d63 --- /dev/null +++ b/.coveragerc @@ -0,0 +1,11 @@ +[run] +omit = + # avoid measuring strange non-existing files + /workspace/config.py + /workspace/config-3.py + + # avoid measuring third-party dist packages + */dist-packages/* + + # avoid measuring code of unittest + tests/* diff --git a/data_juicer/ops/mapper/remove_table_text_mapper.py b/data_juicer/ops/mapper/remove_table_text_mapper.py index ff2b07a4f..87b06e5d1 100644 --- a/data_juicer/ops/mapper/remove_table_text_mapper.py +++ b/data_juicer/ops/mapper/remove_table_text_mapper.py @@ -36,8 +36,8 @@ def __init__(self, def process_batched(self, samples): for idx, text in enumerate(samples[self.text_key]): - for idx in range(self.min_col - 1, self.max_col): - pattern = re.compile(self.pattern % idx) + for i in range(self.min_col - 1, self.max_col): + pattern = re.compile(self.pattern % i) text = pattern.sub('', text) samples[self.text_key][idx] = text diff --git a/data_juicer/utils/model_utils.py b/data_juicer/utils/model_utils.py index cda046b81..dc593bc7f 100644 --- a/data_juicer/utils/model_utils.py +++ b/data_juicer/utils/model_utils.py @@ -612,7 +612,8 @@ def prepare_model(model_type, **model_kwargs): model_func = MODEL_FUNCTION_MAPPING[model_type] model_key = partial(model_func, **model_kwargs) # always instantiate once for possible caching - model_key() + if model_type != 'vllm': + model_key() return model_key diff --git a/data_juicer/utils/unittest_utils.py b/data_juicer/utils/unittest_utils.py index 81033b224..1e66c55cc 100644 --- a/data_juicer/utils/unittest_utils.py +++ b/data_juicer/utils/unittest_utils.py @@ -16,6 +16,8 @@ SKIPPED_TESTS = Registry('SkippedTests') +CLEAR_MODEL = False + def TEST_TAG(*tags): """Tags for test case. 
@@ -29,6 +31,15 @@ def decorator(func): return decorator +def set_clear_model_flag(flag): + global CLEAR_MODEL + CLEAR_MODEL = flag + if CLEAR_MODEL: + print('CLEAR DOWNLOADED MODELS AFTER UNITTESTS.') + else: + print('KEEP DOWNLOADED MODELS AFTER UNITTESTS.') + + class DataJuicerTestCaseBase(unittest.TestCase): @classmethod @@ -48,7 +59,9 @@ def tearDownClass(cls, hf_model_name=None) -> None: multiprocess.set_start_method(cls.original_mp_method, force=True) # clean the huggingface model cache files - if hf_model_name: + if not CLEAR_MODEL: + pass + elif hf_model_name: # given the hf model name, remove this model only model_dir = os.path.join( transformers.TRANSFORMERS_CACHE, diff --git a/environments/dev_requires.txt b/environments/dev_requires.txt index ff091a304..9793d5746 100644 --- a/environments/dev_requires.txt +++ b/environments/dev_requires.txt @@ -1,3 +1,4 @@ +coverage pre-commit sphinx sphinx-autobuild diff --git a/tests/core/test_monitor.py b/tests/core/test_monitor.py index 01840348d..3f7a35f21 100644 --- a/tests/core/test_monitor.py +++ b/tests/core/test_monitor.py @@ -4,6 +4,8 @@ from data_juicer.core import Monitor from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +# Skip this test due to some random resource utilization fluctuation, which may +# cause failure of this test @SKIPPED_TESTS.register_module() class MonitorTest(DataJuicerTestCaseBase): diff --git a/tests/ops/filter/test_image_aesthetics_filter.py b/tests/ops/filter/test_image_aesthetics_filter.py index e20f9d2c6..3ebb8419c 100644 --- a/tests/ops/filter/test_image_aesthetics_filter.py +++ b/tests/ops/filter/test_image_aesthetics_filter.py @@ -8,7 +8,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageAestheticsFilterTest(DataJuicerTestCaseBase): maxDiff = None diff --git a/tests/ops/filter/test_image_face_count_filter.py 
b/tests/ops/filter/test_image_face_count_filter.py index dd106f6bb..becb47148 100644 --- a/tests/ops/filter/test_image_face_count_filter.py +++ b/tests/ops/filter/test_image_face_count_filter.py @@ -8,7 +8,6 @@ from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageFaceCountFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_image_face_ratio_filter.py b/tests/ops/filter/test_image_face_ratio_filter.py index d82ac2ec1..69a10a42d 100644 --- a/tests/ops/filter/test_image_face_ratio_filter.py +++ b/tests/ops/filter/test_image_face_ratio_filter.py @@ -8,7 +8,6 @@ from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageFaceRatioFilterTest(DataJuicerTestCaseBase): maxDiff = None diff --git a/tests/ops/filter/test_image_nsfw_filter.py b/tests/ops/filter/test_image_nsfw_filter.py index 0a588e272..87f24e94b 100644 --- a/tests/ops/filter/test_image_nsfw_filter.py +++ b/tests/ops/filter/test_image_nsfw_filter.py @@ -10,7 +10,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageNSFWFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_image_text_matching_filter.py b/tests/ops/filter/test_image_text_matching_filter.py index 0551da254..91ed938df 100644 --- a/tests/ops/filter/test_image_text_matching_filter.py +++ b/tests/ops/filter/test_image_text_matching_filter.py @@ -11,7 +11,6 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageTextMatchingFilterTest(DataJuicerTestCaseBase): data_path = 
os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_image_watermark_filter.py b/tests/ops/filter/test_image_watermark_filter.py index 01ed2e0dc..b5e5146f8 100644 --- a/tests/ops/filter/test_image_watermark_filter.py +++ b/tests/ops/filter/test_image_watermark_filter.py @@ -10,7 +10,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class ImageWatermarkFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_phrase_grounding_recall_filter.py b/tests/ops/filter/test_phrase_grounding_recall_filter.py index e865c2f22..201e71214 100644 --- a/tests/ops/filter/test_phrase_grounding_recall_filter.py +++ b/tests/ops/filter/test_phrase_grounding_recall_filter.py @@ -11,7 +11,6 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class PhraseGroundingRecallFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_video_aesthetics_filter.py b/tests/ops/filter/test_video_aesthetics_filter.py index 551d0e721..b0681ef4d 100644 --- a/tests/ops/filter/test_video_aesthetics_filter.py +++ b/tests/ops/filter/test_video_aesthetics_filter.py @@ -9,7 +9,6 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoAestheticsFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_video_nsfw_filter.py b/tests/ops/filter/test_video_nsfw_filter.py index 3c713407d..8376eb7af 100644 --- 
a/tests/ops/filter/test_video_nsfw_filter.py +++ b/tests/ops/filter/test_video_nsfw_filter.py @@ -10,7 +10,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoNSFWFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_video_ocr_area_ratio_filter.py b/tests/ops/filter/test_video_ocr_area_ratio_filter.py index 9884ab1cf..b7c7e6f50 100644 --- a/tests/ops/filter/test_video_ocr_area_ratio_filter.py +++ b/tests/ops/filter/test_video_ocr_area_ratio_filter.py @@ -8,7 +8,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoOcrAreaRatioFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/filter/test_video_tagging_from_frames_filter.py b/tests/ops/filter/test_video_tagging_from_frames_filter.py index c16b07d4d..545be9748 100644 --- a/tests/ops/filter/test_video_tagging_from_frames_filter.py +++ b/tests/ops/filter/test_video_tagging_from_frames_filter.py @@ -8,7 +8,6 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoTaggingFromFramesFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', 'data') diff --git a/tests/ops/filter/test_video_watermark_filter.py b/tests/ops/filter/test_video_watermark_filter.py index aca75131f..629319e3f 100644 --- a/tests/ops/filter/test_video_watermark_filter.py +++ b/tests/ops/filter/test_video_watermark_filter.py @@ -10,7 +10,6 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import 
DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoWatermarkFilterTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/mapper/test_extract_qa_mapper.py b/tests/ops/mapper/test_extract_qa_mapper.py index 2e1b59a78..415efad4e 100644 --- a/tests/ops/mapper/test_extract_qa_mapper.py +++ b/tests/ops/mapper/test_extract_qa_mapper.py @@ -4,7 +4,7 @@ from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class ExtractQAMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_generate_instruction_mapper.py b/tests/ops/mapper/test_generate_instruction_mapper.py index 43bd31262..a250fbcc4 100644 --- a/tests/ops/mapper/test_generate_instruction_mapper.py +++ b/tests/ops/mapper/test_generate_instruction_mapper.py @@ -4,7 +4,7 @@ from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class GenerateInstructionMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_image_captioning_mapper.py b/tests/ops/mapper/test_image_captioning_mapper.py index c4c3d1e3e..2a772ab20 100644 --- a/tests/ops/mapper/test_image_captioning_mapper.py +++ b/tests/ops/mapper/test_image_captioning_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class ImageCaptioningMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_image_diffusion_mapper.py b/tests/ops/mapper/test_image_diffusion_mapper.py index ad241732f..5883a7ff7 100644 --- a/tests/ops/mapper/test_image_diffusion_mapper.py +++ b/tests/ops/mapper/test_image_diffusion_mapper.py @@ -10,7 +10,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. @SKIPPED_TESTS.register_module() class ImageDiffusionMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_image_tagging_mapper.py b/tests/ops/mapper/test_image_tagging_mapper.py index e9609b12f..06fd6fba7 100644 --- a/tests/ops/mapper/test_image_tagging_mapper.py +++ b/tests/ops/mapper/test_image_tagging_mapper.py @@ -8,6 +8,8 @@ from data_juicer.utils.constant import Fields from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +# Skip tests for this OP in the GitHub actions due to OOM on the current runner +# These tests have been tested locally. @SKIPPED_TESTS.register_module() class ImageTaggingMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/mapper/test_nlpcda_zh_mapper.py b/tests/ops/mapper/test_nlpcda_zh_mapper.py index 3624a9c35..ce21ea55d 100644 --- a/tests/ops/mapper/test_nlpcda_zh_mapper.py +++ b/tests/ops/mapper/test_nlpcda_zh_mapper.py @@ -4,9 +4,12 @@ from data_juicer.core import NestedDataset as Dataset from data_juicer.ops.mapper.nlpcda_zh_mapper import NlpcdaZhMapper -from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase +from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +# Skip tests for this OP in the GitHub actions due to unknown UnicodeEncodeError +# These tests have been tested locally. 
+@SKIPPED_TESTS.register_module() class NlpaugEnMapperTest(DataJuicerTestCaseBase): def setUp(self): diff --git a/tests/ops/mapper/test_optimize_instruction_mapper.py b/tests/ops/mapper/test_optimize_instruction_mapper.py index 7c7b58b4c..4b3e4562b 100644 --- a/tests/ops/mapper/test_optimize_instruction_mapper.py +++ b/tests/ops/mapper/test_optimize_instruction_mapper.py @@ -3,7 +3,7 @@ from data_juicer.utils.unittest_utils import (SKIPPED_TESTS, DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError. # These tests have been tested locally. @SKIPPED_TESTS.register_module() class OptimizeInstructionMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py index caadeb97b..402509639 100644 --- a/tests/ops/mapper/test_video_captioning_from_audio_mapper.py +++ b/tests/ops/mapper/test_video_captioning_from_audio_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. @SKIPPED_TESTS.register_module() class VideoCaptioningFromAudioMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py index 71bf963f6..d9bf29724 100644 --- a/tests/ops/mapper/test_video_captioning_from_frames_mapper.py +++ b/tests/ops/mapper/test_video_captioning_from_frames_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py index 79f8037b9..016a4d73b 100644 --- a/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py +++ b/tests/ops/mapper/test_video_captioning_from_summarizer_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. @SKIPPED_TESTS.register_module() class VideoCaptioningFromSummarizerMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_video_captioning_from_video_mapper.py b/tests/ops/mapper/test_video_captioning_from_video_mapper.py index 012761af5..f3de27226 100644 --- a/tests/ops/mapper/test_video_captioning_from_video_mapper.py +++ b/tests/ops/mapper/test_video_captioning_from_video_mapper.py @@ -9,7 +9,7 @@ DataJuicerTestCaseBase) -# Skip tests for this OP in the GitHub actions due to disk space limitation. +# Skip tests for this OP in the GitHub actions due to OOM on the current runner # These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase): diff --git a/tests/ops/mapper/test_video_remove_watermark_mapper.py b/tests/ops/mapper/test_video_remove_watermark_mapper.py index 0cfefa76f..96231a976 100644 --- a/tests/ops/mapper/test_video_remove_watermark_mapper.py +++ b/tests/ops/mapper/test_video_remove_watermark_mapper.py @@ -11,7 +11,6 @@ DataJuicerTestCaseBase) -@SKIPPED_TESTS.register_module() class VideoRemoveWatermarkMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/mapper/test_video_split_by_scene_mapper.py b/tests/ops/mapper/test_video_split_by_scene_mapper.py index dbbc32553..6e71789e6 100644 --- a/tests/ops/mapper/test_video_split_by_scene_mapper.py +++ b/tests/ops/mapper/test_video_split_by_scene_mapper.py @@ -8,7 +8,6 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS -@SKIPPED_TESTS.register_module() class VideoSplitBySceneMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py index b310591a4..4484df754 100644 --- a/tests/ops/mapper/test_video_tagging_from_frames_mapper.py +++ b/tests/ops/mapper/test_video_tagging_from_frames_mapper.py @@ -9,6 +9,8 @@ from data_juicer.utils.mm_utils import SpecialTokens from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS +# Skip tests for this OP in the GitHub actions due to OOM on the current runner +# These tests have been tested locally. 
@SKIPPED_TESTS.register_module() class VideoTaggingFromFramesMapperTest(DataJuicerTestCaseBase): data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..', diff --git a/tests/ops/test_op_fusion.py b/tests/ops/test_op_fusion.py index d545e0074..271737154 100644 --- a/tests/ops/test_op_fusion.py +++ b/tests/ops/test_op_fusion.py @@ -5,7 +5,6 @@ DataJuicerTestCaseBase) -@SKIPPED_TESTS.register_module() class OpFusionTest(DataJuicerTestCaseBase): def _run_op_fusion(self, original_process_list, target_process_list): diff --git a/tests/run.py b/tests/run.py index 3b7b99736..378d6b0d6 100644 --- a/tests/run.py +++ b/tests/run.py @@ -10,10 +10,11 @@ import os import sys import unittest +import coverage from loguru import logger -from data_juicer.utils.unittest_utils import SKIPPED_TESTS +from data_juicer.utils.unittest_utils import SKIPPED_TESTS, set_clear_model_flag file_dir = os.path.join(os.path.dirname(__file__), '..') sys.path.append(file_dir) @@ -26,8 +27,14 @@ parser.add_argument('--test_dir', default='tests', help='directory to be tested') +parser.add_argument('--clear_model', + default=False, + type=lambda v: v.lower() in ('true', '1', 'yes'), + help='whether to clear the downloaded models for tests. ' + 'It\'s False in default.') args = parser.parse_args() +set_clear_model_flag(args.clear_model) class TaggedTestLoader(unittest.TestLoader): def __init__(self, tag="standalone"): @@ -66,13 +73,21 @@ def gather_test_cases(test_dir, pattern, tag): def main(): + cov = coverage.Coverage() + cov.start() + runner = unittest.TextTestRunner() test_suite = gather_test_cases(os.path.abspath(args.test_dir), args.pattern, args.tag) res = runner.run(test_suite) + + cov.stop() + if not res.wasSuccessful(): exit(1) + cov.report() + if __name__ == '__main__': main()