refine op

modelscope · Nov 12, 2024 · ee86360 · ee86360
1 parent 3e8a838
commit ee86360
Show file tree

Hide file tree

Showing 13 changed files with 237 additions and 243 deletions.
diff --git a/configs/config_all.yaml b/configs/config_all.yaml
@@ -132,6 +132,10 @@ process:
       cv_classifier: ''                                       # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
       blur_type: 'gaussian'                                   # type of blur kernel, including ['mean', 'box', 'gaussian']
       radius: 2                                               # radius of blur kernel
+  - image_segment_mapper:                                   # perform segment-anything on images and return the bounding boxes.
+      imgsz: 1024                                             # image resolution after image resizing
+      conf: 0.05                                              # confidence score threshold
+      iou: 0.5                                                # IoU (Intersection over Union) score threshold
   - image_tagging_mapper:                                   # Mapper to generate image tags.
       tag_field_name: '__dj__image_tags__'                    # the field name to store the tags. It's "__dj__image_tags__" by default.
   - nlpaug_en_mapper:                                       # simply augment texts in English based on the nlpaug library
@@ -195,11 +199,6 @@ process:
       lang: en                                                # sample in which language
       tokenization: false                                     # whether to use model to tokenize documents
       substrings: ['http', 'www', '.com', 'href', '//']       # incorrect substrings to remove
-  - segment_mapper:                                         # perform segment-anything on images and return the bounding box values.
-      fastsam_path: './FastSAM-x.pt'                          # model name of the FastSAM model on ultralytics
-      imgsz: 1024                                             # image resolution after image resizing
-      conf: 0.05                                              # confidence score threshold
-      iou: 0.5                                                # IoU (Intersection over Union) score threshold
   - sentence_split_mapper:                                  # split text to multiple sentences and join them with '\n'
       lang: 'en'                                              # split text in what language
   - video_captioning_from_audio_mapper:                     # caption a video according to its audio streams based on Qwen-Audio model

diff --git a/data_juicer/ops/mapper/__init__.py b/data_juicer/ops/mapper/__init__.py
@@ -17,6 +17,7 @@
 from .image_captioning_mapper import ImageCaptioningMapper
 from .image_diffusion_mapper import ImageDiffusionMapper
 from .image_face_blur_mapper import ImageFaceBlurMapper
+from .image_segment_mapper import ImageSegmentMapper
 from .image_tagging_mapper import ImageTaggingMapper
 from .nlpaug_en_mapper import NlpaugEnMapper
 from .nlpcda_zh_mapper import NlpcdaZhMapper
@@ -36,7 +37,6 @@
 from .remove_words_with_incorrect_substrings_mapper import \
     RemoveWordsWithIncorrectSubstringsMapper
 from .replace_content_mapper import ReplaceContentMapper
-from .segment_mapper import SegmentMapper
 from .sentence_split_mapper import SentenceSplitMapper
 from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
 from .video_captioning_from_frames_mapper import \
@@ -63,15 +63,15 @@
     'ExpandMacroMapper', 'FixUnicodeMapper', 'GenerateQAFromExamplesMapper',
     'GenerateQAFromTextMapper', 'ImageBlurMapper',
     'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper',
-    'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper',
-    'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper',
-    'OptimizeQueryMapper', 'OptimizeResponseMapper',
+    'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageSegmentMapper',
+    'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
+    'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
     'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
     'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
     'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
     'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
     'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
-    'SegmentMapper', 'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
+    'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
     'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
     'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
     'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',

diff --git a/data_juicer/ops/mapper/image_segment_mapper.py b/data_juicer/ops/mapper/image_segment_mapper.py
@@ -0,0 +1,70 @@
+import numpy as np
+
+from data_juicer.utils.constant import Fields
+from data_juicer.utils.lazy_loader import LazyLoader
+from data_juicer.utils.mm_utils import load_data_with_context, load_image
+from data_juicer.utils.model_utils import get_model, prepare_model
+
+from ..base_op import OPERATORS, UNFORKABLE, Mapper
+from ..op_fusion import LOADED_IMAGES
+
+OP_NAME = 'image_segment_mapper'
+
+torch = LazyLoader('torch', 'torch')
+ultralytics = LazyLoader('ultralytics', 'ultralytics')
+
+
+@UNFORKABLE.register_module(OP_NAME)
+@OPERATORS.register_module(OP_NAME)
+@LOADED_IMAGES.register_module(OP_NAME)
+class ImageSegmentMapper(Mapper):
+    """Perform segment-anything on images and return the bounding boxes.
+
+    Every image of a sample is passed through a FastSAM model and the
+    predicted boxes (in ``xywh`` form) are stored in the sample under
+    ``Fields.bbox_tag``.
+    """
+
+    _accelerator = 'cuda'
+
+    def __init__(self, imgsz=1024, conf=0.05, iou=0.5, *args, **kwargs):
+        """
+        Initialization method.
+
+        The model is always prepared from the 'FastSAM-x.pt' checkpoint;
+        the three thresholds below are forwarded to the model on every
+        call in ``process_single``.
+
+        :param imgsz: resolution for image resizing
+        :param conf: confidence score threshold
+        :param iou: IoU (Intersection over Union) score threshold
+        :param args: extra positional args forwarded to the base class
+        :param kwargs: extra keyword args forwarded to the base class
+
+        """
+        super().__init__(*args, **kwargs)
+
+        self.model_key = prepare_model(model_type='fastsam',
+                                       model_path='FastSAM-x.pt')
+
+        self.imgsz = imgsz
+        self.conf = conf
+        self.iou = iou
+
+    def process_single(self, sample, rank=None, context=False):
+        """
+        Compute bounding boxes for all images of one sample.
+
+        :param sample: the sample to process; its images are read from
+            ``sample[self.image_key]``
+        :param rank: forwarded to ``get_model`` when fetching the model
+        :param context: forwarded to ``load_data_with_context`` when
+            loading the images
+        :return: the same sample with ``Fields.bbox_tag`` filled in
+        """
+        # there is no image in this sample
+        if self.image_key not in sample or not sample[self.image_key]:
+            # N x M x 4 for N images, M boxes, 4 coords
+            sample[Fields.bbox_tag] = np.empty((0, 0, 4), dtype=np.float32)
+            return sample
+
+        loaded_image_keys = sample[self.image_key]
+        sample, images = load_data_with_context(sample, context,
+                                                loaded_image_keys, load_image)
+
+        model = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda())
+        # NOTE(review): from here on the field is a Python list holding one
+        # array per image, while the empty cases use a single ndarray --
+        # confirm downstream consumers accept both.
+        sample[Fields.bbox_tag] = []
+
+        # NOTE(review): if `images` is a mapping, this loop yields its keys
+        # rather than the loaded images -- confirm what reaches the model.
+        for image in images:
+            # keep only the first result returned by the model for this input
+            masks = model(image,
+                          retina_masks=True,
+                          imgsz=self.imgsz,
+                          conf=self.conf,
+                          iou=self.iou,
+                          verbose=False)[0]
+            # store the xywh boxes as a NumPy array on the CPU
+            sample[Fields.bbox_tag].append(masks.boxes.xywh.cpu().numpy())
+
+        # match schema
+        if len(sample[Fields.bbox_tag]) == 0:
+            sample[Fields.bbox_tag] = np.empty((0, 0, 4), dtype=np.float32)
+        return sample
diff --git a/data_juicer/ops/mapper/image_tagging_mapper.py b/data_juicer/ops/mapper/image_tagging_mapper.py
@@ -38,8 +38,8 @@ def __init__(self,
         """
         super().__init__(*args, **kwargs)
         self.model_key = prepare_model(
-            model_type='recognizeAnything',
-            pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
+            model_type='ram',
+            model_path='ram_plus_swin_large_14m.pth',
             input_size=384)
         self.transform = ram.get_transform(image_size=384)
         self.tag_field_name = tag_field_name

diff --git a/data_juicer/ops/mapper/segment_mapper.py b/data_juicer/ops/mapper/segment_mapper.py
diff --git a/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py b/data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
@@ -61,8 +61,8 @@ def __init__(self,
                 f'Frame sampling method [{frame_sampling_method}] is not '
                 f'supported. Can only be one of ["all_keyframes", "uniform"].')
         self.model_key = prepare_model(
-            model_type='recognizeAnything',
-            pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
+            model_type='ram',
+            model_path='ram_plus_swin_large_14m.pth',
             input_size=384)
         self.frame_sampling_method = frame_sampling_method
         self.frame_num = frame_num