Skip to content

Commit

Permalink
refine op
Browse files Browse the repository at this point in the history
  • Loading branch information
drcege committed Nov 12, 2024
1 parent 3e8a838 commit ee86360
Show file tree
Hide file tree
Showing 13 changed files with 237 additions and 243 deletions.
9 changes: 4 additions & 5 deletions configs/config_all.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,10 @@ process:
cv_classifier: '' # OpenCV classifier path for face detection. By default, we will use 'haarcascade_frontalface_alt.xml'.
blur_type: 'gaussian' # type of blur kernel, including ['mean', 'box', 'gaussian']
radius: 2 # radius of blur kernel
- image_segment_mapper: # perform segment-anything on images and return the bounding boxes.
imgsz: 1024 # image resolution after image resizing
conf: 0.05 # confidence score threshold
iou: 0.5 # IoU (Intersection over Union) score threshold
- image_tagging_mapper: # Mapper to generate image tags.
tag_field_name: '__dj__image_tags__' # the field name to store the tags. It's "__dj__image_tags__" by default.
- nlpaug_en_mapper: # simply augment texts in English based on the nlpaug library
Expand Down Expand Up @@ -195,11 +199,6 @@ process:
lang: en # sample in which language
tokenization: false # whether to use model to tokenize documents
substrings: ['http', 'www', '.com', 'href', '//'] # incorrect substrings to remove
- segment_mapper: # perform segment-anything on images and return the bounding box values.
fastsam_path: './FastSAM-x.pt' # model name of the FastSAM model on ultralytics
imgsz: 1024 # image resolution after image resizing
conf: 0.05 # confidence score threshold
iou: 0.5 # IoU (Intersection over Union) score threshold
- sentence_split_mapper: # split text to multiple sentences and join them with '\n'
lang: 'en' # split text in what language
- video_captioning_from_audio_mapper: # caption a video according to its audio streams based on Qwen-Audio model
Expand Down
10 changes: 5 additions & 5 deletions data_juicer/ops/mapper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from .image_captioning_mapper import ImageCaptioningMapper
from .image_diffusion_mapper import ImageDiffusionMapper
from .image_face_blur_mapper import ImageFaceBlurMapper
from .image_segment_mapper import ImageSegmentMapper
from .image_tagging_mapper import ImageTaggingMapper
from .nlpaug_en_mapper import NlpaugEnMapper
from .nlpcda_zh_mapper import NlpcdaZhMapper
Expand All @@ -36,7 +37,6 @@
from .remove_words_with_incorrect_substrings_mapper import \
RemoveWordsWithIncorrectSubstringsMapper
from .replace_content_mapper import ReplaceContentMapper
from .segment_mapper import SegmentMapper
from .sentence_split_mapper import SentenceSplitMapper
from .video_captioning_from_audio_mapper import VideoCaptioningFromAudioMapper
from .video_captioning_from_frames_mapper import \
Expand All @@ -63,15 +63,15 @@
'ExpandMacroMapper', 'FixUnicodeMapper', 'GenerateQAFromExamplesMapper',
'GenerateQAFromTextMapper', 'ImageBlurMapper',
'ImageCaptioningFromGPT4VMapper', 'ImageCaptioningMapper',
'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageTaggingMapper',
'NlpaugEnMapper', 'NlpcdaZhMapper', 'OptimizeQAMapper',
'OptimizeQueryMapper', 'OptimizeResponseMapper',
'ImageDiffusionMapper', 'ImageFaceBlurMapper', 'ImageSegmentMapper',
'ImageTaggingMapper', 'NlpaugEnMapper', 'NlpcdaZhMapper',
'OptimizeQAMapper', 'OptimizeQueryMapper', 'OptimizeResponseMapper',
'PunctuationNormalizationMapper', 'RemoveBibliographyMapper',
'RemoveCommentsMapper', 'RemoveHeaderMapper', 'RemoveLongWordsMapper',
'RemoveNonChineseCharacterlMapper', 'RemoveRepeatSentencesMapper',
'RemoveSpecificCharsMapper', 'RemoveTableTextMapper',
'RemoveWordsWithIncorrectSubstringsMapper', 'ReplaceContentMapper',
'SegmentMapper', 'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
'SentenceSplitMapper', 'VideoCaptioningFromAudioMapper',
'VideoCaptioningFromFramesMapper', 'VideoCaptioningFromSummarizerMapper',
'VideoCaptioningFromVideoMapper', 'VideoFFmpegWrappedMapper',
'VideoFaceBlurMapper', 'VideoRemoveWatermarkMapper',
Expand Down
70 changes: 70 additions & 0 deletions data_juicer/ops/mapper/image_segment_mapper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
import numpy as np

from data_juicer.utils.constant import Fields
from data_juicer.utils.lazy_loader import LazyLoader
from data_juicer.utils.mm_utils import load_data_with_context, load_image
from data_juicer.utils.model_utils import get_model, prepare_model

from ..base_op import OPERATORS, UNFORKABLE, Mapper
from ..op_fusion import LOADED_IMAGES

# Name under which this operator is registered in the registries used by the
# class decorators below.
OP_NAME = 'image_segment_mapper'

# Module handles created through LazyLoader rather than plain imports.
# NOTE(review): presumably this defers the real import until first use --
# confirm against data_juicer.utils.lazy_loader. Neither handle is referenced
# directly in the visible remainder of this module.
torch = LazyLoader('torch', 'torch')
ultralytics = LazyLoader('ultralytics', 'ultralytics')


@UNFORKABLE.register_module(OP_NAME)
@OPERATORS.register_module(OP_NAME)
@LOADED_IMAGES.register_module(OP_NAME)
class ImageSegmentMapper(Mapper):
    """Run a FastSAM model over a sample's images and store the resulting
    bounding boxes under ``Fields.bbox_tag``."""

    _accelerator = 'cuda'

    def __init__(self, imgsz=1024, conf=0.05, iou=0.5, *args, **kwargs):
        """
        Initialization method.

        :param imgsz: resolution for image resizing
        :param conf: confidence score threshold
        :param iou: IoU (Intersection over Union) score threshold
        :param args: extra positional args forwarded to the base class
        :param kwargs: extra keyword args forwarded to the base class
        """
        super().__init__(*args, **kwargs)

        # The checkpoint name is fixed; only the inference options below are
        # configurable.
        self.model_key = prepare_model(model_type='fastsam',
                                       model_path='FastSAM-x.pt')

        self.imgsz, self.conf, self.iou = imgsz, conf, iou

    @staticmethod
    def _no_boxes():
        """Return the empty placeholder stored when there are no boxes.

        Shape is N x M x 4 for N images, M boxes, 4 coords.
        """
        return np.empty((0, 0, 4), dtype=np.float32)

    def process_single(self, sample, rank=None, context=False):
        """
        Compute bounding boxes for every image of ``sample``.

        :param sample: the sample to process; updated in place
        :param rank: forwarded to ``get_model`` when fetching the model
        :param context: forwarded to ``load_data_with_context``
        :return: the sample with ``Fields.bbox_tag`` filled in
        """
        has_images = (self.image_key in sample
                      and bool(sample[self.image_key]))
        if not has_images:
            # nothing to segment: store the empty placeholder and stop early
            sample[Fields.bbox_tag] = self._no_boxes()
            return sample

        image_keys = sample[self.image_key]
        sample, images = load_data_with_context(sample, context, image_keys,
                                                load_image)

        model = get_model(self.model_key, rank=rank, use_cuda=self.use_cuda())

        # the inference options are the same for every image of the sample
        predict_options = {
            'retina_masks': True,
            'imgsz': self.imgsz,
            'conf': self.conf,
            'iou': self.iou,
            'verbose': False,
        }

        sample[Fields.bbox_tag] = []
        for image in images:
            first_result = model(image, **predict_options)[0]
            boxes_xywh = first_result.boxes.xywh.cpu().numpy()
            sample[Fields.bbox_tag].append(boxes_xywh)

        # match schema: fall back to the placeholder if nothing was collected
        if not sample[Fields.bbox_tag]:
            sample[Fields.bbox_tag] = self._no_boxes()
        return sample
4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/image_tagging_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,8 +38,8 @@ def __init__(self,
"""
super().__init__(*args, **kwargs)
self.model_key = prepare_model(
model_type='recognizeAnything',
pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
model_type='ram',
model_path='ram_plus_swin_large_14m.pth',
input_size=384)
self.transform = ram.get_transform(image_size=384)
self.tag_field_name = tag_field_name
Expand Down
87 changes: 0 additions & 87 deletions data_juicer/ops/mapper/segment_mapper.py

This file was deleted.

4 changes: 2 additions & 2 deletions data_juicer/ops/mapper/video_tagging_from_frames_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,8 +61,8 @@ def __init__(self,
f'Frame sampling method [{frame_sampling_method}] is not '
f'supported. Can only be one of ["all_keyframes", "uniform"].')
self.model_key = prepare_model(
model_type='recognizeAnything',
pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
model_type='ram',
model_path='ram_plus_swin_large_14m.pth',
input_size=384)
self.frame_sampling_method = frame_sampling_method
self.frame_num = frame_num
Expand Down
Loading

0 comments on commit ee86360

Please sign in to comment.