Skip to content

Commit

Permalink
unittest opt (#460)
Browse files Browse the repository at this point in the history
* * use coverage to check the unittest coverage

* * ignore trace for unknown strange file config-3.py

* * ignore trace for unknown strange file config-3.py

* * ignore third-party packages and entrance of unittest

* * ignore code of unittest

* * keep the downloaded models during unittest by default and make it customizable

* * disable all skipping on some model-based unittest

* * enable unittest

* * skip initialization models when preparing vllm models

* * fix bug in remove_table_text_mapper from batched modification

* * set mem_required for tests of image_captioning_mapper

* * skip tests that might be OOM

* * skip 3 vllm-related OPs and 1 OOM OP

* - avoid two tests
  • Loading branch information
HYLcool authored Nov 1, 2024
1 parent 9e20e6a commit d185d54
Show file tree
Hide file tree
Showing 34 changed files with 65 additions and 30 deletions.
11 changes: 11 additions & 0 deletions .coveragerc
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
[run]
omit =
# avoid measuring strange non-existing files
/workspace/config.py
/workspace/config-3.py

# avoid measuring third-party dist packages
*/dist-packages/*

# avoid measuring code of unittest
tests/*
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/remove_table_text_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,8 +36,8 @@ def __init__(self,

def process_batched(self, samples):
    """Remove every match of the configured patterns from a batch of texts.

    The template ``self.pattern`` is filled with each integer in
    ``range(self.min_col - 1, self.max_col)``; every resulting regex is
    applied to each text under ``self.text_key`` and its matches are
    replaced with the empty string. The cleaned text is written back to
    the same position in ``samples[self.text_key]``.

    :param samples: batched samples; ``samples[self.text_key]`` holds the
        texts to clean and is updated in place.
    """
    # The regexes do not depend on the sample, so build them once per batch
    # instead of once per text.
    patterns = [
        re.compile(self.pattern % i)
        for i in range(self.min_col - 1, self.max_col)
    ]
    for idx, text in enumerate(samples[self.text_key]):
        # ``idx`` must keep pointing at the current sample: it is the
        # position used for the write-back below.
        for pattern in patterns:
            text = pattern.sub('', text)

        samples[self.text_key][idx] = text
Expand Down
3 changes: 2 additions & 1 deletion data_juicer/utils/model_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -612,7 +612,8 @@ def prepare_model(model_type, **model_kwargs):
model_func = MODEL_FUNCTION_MAPPING[model_type]
model_key = partial(model_func, **model_kwargs)
# always instantiate once for possible caching
model_key()
if model_type != 'vllm':
model_key()
return model_key


Expand Down
15 changes: 14 additions & 1 deletion data_juicer/utils/unittest_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@

# Registry of test classes marked to be skipped; a test class opts in by
# decorating itself with ``@SKIPPED_TESTS.register_module()``.
SKIPPED_TESTS = Registry('SkippedTests')

# Whether downloaded models are cleaned up after unit tests. Defaults to
# ``False`` (keep them) and is changed through ``set_clear_model_flag``.
CLEAR_MODEL = False


def TEST_TAG(*tags):
"""Tags for test case.
Expand All @@ -29,6 +31,15 @@ def decorator(func):
return decorator


def set_clear_model_flag(flag):
    """Store ``flag`` in the module-level ``CLEAR_MODEL`` and announce it.

    :param flag: truthy to clear the downloaded models after the unit
        tests, falsy to keep them.
    """
    global CLEAR_MODEL
    CLEAR_MODEL = flag
    # Report the chosen mode once so it is visible in the test output.
    message = ('CLEAR DOWNLOADED MODELS AFTER UNITTESTS.'
               if flag else 'KEEP DOWNLOADED MODELS AFTER UNITTESTS.')
    print(message)


class DataJuicerTestCaseBase(unittest.TestCase):

@classmethod
Expand All @@ -48,7 +59,9 @@ def tearDownClass(cls, hf_model_name=None) -> None:
multiprocess.set_start_method(cls.original_mp_method, force=True)

# clean the huggingface model cache files
if hf_model_name:
if not CLEAR_MODEL:
pass
elif hf_model_name:
# given the hf model name, remove this model only
model_dir = os.path.join(
transformers.TRANSFORMERS_CACHE,
Expand Down
1 change: 1 addition & 0 deletions environments/dev_requires.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
coverage
pre-commit
sphinx
sphinx-autobuild
Expand Down
2 changes: 2 additions & 0 deletions tests/core/test_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
from data_juicer.core import Monitor
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

# Skip this test due to some random resource utilization fluctuation, which may
# cause failure of this test
@SKIPPED_TESTS.register_module()
class MonitorTest(DataJuicerTestCaseBase):

Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class ImageAestheticsFilterTest(DataJuicerTestCaseBase):

maxDiff = None
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_face_count_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS


@SKIPPED_TESTS.register_module()
class ImageFaceCountFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_face_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS


@SKIPPED_TESTS.register_module()
class ImageFaceRatioFilterTest(DataJuicerTestCaseBase):

maxDiff = None
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class ImageNSFWFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_text_matching_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class ImageTextMatchingFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_image_watermark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class ImageWatermarkFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_phrase_grounding_recall_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class PhraseGroundingRecallFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_video_aesthetics_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoAestheticsFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_video_nsfw_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoNSFWFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_video_ocr_area_ratio_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoOcrAreaRatioFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_video_tagging_from_frames_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoTaggingFromFramesFilterTest(DataJuicerTestCaseBase):
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
'data')
Expand Down
1 change: 0 additions & 1 deletion tests/ops/filter/test_video_watermark_filter.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoWatermarkFilterTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/mapper/test_extract_qa_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
DataJuicerTestCaseBase)

# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError.
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class ExtractQAMapperTest(DataJuicerTestCaseBase):
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/mapper/test_generate_instruction_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
DataJuicerTestCaseBase)

# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError.
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class GenerateInstructionMapperTest(DataJuicerTestCaseBase):
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/mapper/test_image_captioning_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class ImageCaptioningMapperTest(DataJuicerTestCaseBase):
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/mapper/test_image_diffusion_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class ImageDiffusionMapperTest(DataJuicerTestCaseBase):
Expand Down
2 changes: 2 additions & 0 deletions tests/ops/mapper/test_image_tagging_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
from data_juicer.utils.constant import Fields
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class ImageTaggingMapperTest(DataJuicerTestCaseBase):
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
5 changes: 4 additions & 1 deletion tests/ops/mapper/test_nlpcda_zh_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,12 @@

from data_juicer.core import NestedDataset as Dataset
from data_juicer.ops.mapper.nlpcda_zh_mapper import NlpcdaZhMapper
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS


# Skip tests for this OP in the GitHub actions due to unknown UnicodeEncodeError
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class NlpaugEnMapperTest(DataJuicerTestCaseBase):

def setUp(self):
Expand Down
2 changes: 1 addition & 1 deletion tests/ops/mapper/test_optimize_instruction_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from data_juicer.utils.unittest_utils import (SKIPPED_TESTS,
DataJuicerTestCaseBase)

# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to unknown DistNetworkError.
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class OptimizeInstructionMapperTest(DataJuicerTestCaseBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class VideoCaptioningFromAudioMapperTest(DataJuicerTestCaseBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class VideoCaptioningFromFramesMapperTest(DataJuicerTestCaseBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class VideoCaptioningFromSummarizerMapperTest(DataJuicerTestCaseBase):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
DataJuicerTestCaseBase)


# Skip tests for this OP in the GitHub actions due to disk space limitation.
# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class VideoCaptioningFromVideoMapperTest(DataJuicerTestCaseBase):
Expand Down
1 change: 0 additions & 1 deletion tests/ops/mapper/test_video_remove_watermark_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
DataJuicerTestCaseBase)


@SKIPPED_TESTS.register_module()
class VideoRemoveWatermarkMapperTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/mapper/test_video_split_by_scene_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

@SKIPPED_TESTS.register_module()
class VideoSplitBySceneMapperTest(DataJuicerTestCaseBase):

data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
2 changes: 2 additions & 0 deletions tests/ops/mapper/test_video_tagging_from_frames_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from data_juicer.utils.mm_utils import SpecialTokens
from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase, SKIPPED_TESTS

# Skip tests for this OP in the GitHub actions due to OOM on the current runner
# These tests have been tested locally.
@SKIPPED_TESTS.register_module()
class VideoTaggingFromFramesMapperTest(DataJuicerTestCaseBase):
data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..',
Expand Down
1 change: 0 additions & 1 deletion tests/ops/test_op_fusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
DataJuicerTestCaseBase)


@SKIPPED_TESTS.register_module()
class OpFusionTest(DataJuicerTestCaseBase):

def _run_op_fusion(self, original_process_list, target_process_list):
Expand Down
17 changes: 16 additions & 1 deletion tests/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,10 +10,11 @@
import os
import sys
import unittest
import coverage

from loguru import logger

from data_juicer.utils.unittest_utils import SKIPPED_TESTS
from data_juicer.utils.unittest_utils import SKIPPED_TESTS, set_clear_model_flag

file_dir = os.path.join(os.path.dirname(__file__), '..')
sys.path.append(file_dir)
Expand All @@ -26,8 +27,14 @@
def _str_to_bool(value):
    """Convert a command-line string into a boolean.

    ``type=bool`` calls ``bool()`` on the raw string, so every non-empty
    value -- including ``'False'`` -- is parsed as ``True``. Here the usual
    "false" spellings (and the empty string) map to ``False``; any other
    value remains ``True``, exactly as before.
    """
    return str(value).lower() not in ('', '0', 'false', 'f', 'no', 'n',
                                      'off')


parser.add_argument('--test_dir',
                    default='tests',
                    help='directory to be tested')
parser.add_argument('--clear_model',
                    default=False,
                    type=_str_to_bool,
                    help='whether to clear the downloaded models for tests. '
                    'It\'s False in default.')
args = parser.parse_args()

# Propagate the choice to the test base class before any test is loaded.
set_clear_model_flag(args.clear_model)

class TaggedTestLoader(unittest.TestLoader):
def __init__(self, tag="standalone"):
Expand Down Expand Up @@ -66,13 +73,21 @@ def gather_test_cases(test_dir, pattern, tag):


def main():
    """Run the gathered unit tests while measuring coverage.

    Coverage is recorded only while the selected test suite runs. If the
    run is not successful, the process exits with status 1 before any
    coverage report is produced; otherwise the coverage report is emitted.
    """
    cov = coverage.Coverage()
    cov.start()

    runner = unittest.TextTestRunner()
    test_suite = gather_test_cases(os.path.abspath(args.test_dir),
                                   args.pattern, args.tag)
    res = runner.run(test_suite)

    cov.stop()

    if not res.wasSuccessful():
        # ``exit`` is injected by the ``site`` module for interactive use
        # and may be missing (e.g. ``python -S``); ``sys.exit`` is the
        # supported way for a script to set its exit status.
        sys.exit(1)

    cov.report()


if __name__ == '__main__':
main()

0 comments on commit d185d54

Please sign in to comment.