diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html index 842d016c3..86b88ec8c 100644 --- a/_modules/data_juicer/ops/base_op.html +++ b/_modules/data_juicer/ops/base_op.html @@ -144,6 +144,7 @@

Source code for data_juicer.ops.base_op

             logger.error(
                 f'An error occurred in mapper operation when processing '
                 f'samples {samples}, {type(e)}: {e}')
+            traceback.print_exc()
             ret = {key: [] for key in samples.keys()}
             ret[Fields.stats] = []
             ret[Fields.source_file] = []
@@ -181,6 +182,7 @@ 

Source code for data_juicer.ops.base_op

                 logger.error(
                     f'An error occurred in mapper operation when processing '
                     f'sample {sample}, {type(e)}: {e}')
+                traceback.print_exc()
                 ret = {key: [] for key in sample.keys()}
                 ret[Fields.stats] = []
                 ret[Fields.source_file] = []
diff --git a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
index 14cd0401c..76113ded6 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
@@ -86,7 +86,8 @@ 

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

from jsonargparse.typing import PositiveInt -from data_juicer.utils.mm_utils import load_data_with_context, load_video +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video) from ..base_op import OPERATORS from ..op_fusion import LOADED_VIDEOS @@ -136,6 +137,9 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

if packet.stream.type == 'video': md5_hash.update(bytes(packet)) + for key in videos: + close_video(videos[key]) + return md5_hash.hexdigest()
diff --git a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html index 079719912..a250ba39f 100644 --- a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html +++ b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html @@ -87,7 +87,8 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

from typing import Dict, Set, Tuple from data_juicer.utils.constant import HashKeys -from data_juicer.utils.mm_utils import load_data_with_context, load_video +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video) from ..base_op import OPERATORS, Deduplicator from ..op_fusion import LOADED_VIDEOS @@ -145,6 +146,9 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

if packet.stream.type == 'video': md5_hash.update(bytes(packet)) + for key in videos: + close_video(videos[key]) + sample[HashKeys.videohash] = md5_hash.hexdigest() return sample
diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html index 60d935d11..e49760577 100644 --- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html +++ b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html @@ -88,7 +88,7 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import (extract_key_frames, +from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) @@ -265,7 +265,7 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return sample
diff --git a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html index 19d9a9638..66b218024 100644 --- a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html @@ -87,7 +87,8 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

import numpy as np from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import load_data_with_context, load_video +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video) from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_VIDEOS @@ -151,7 +152,7 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter

video_aspect_ratios[ key] = stream.codec_context.width / stream.codec_context.height if not context: - video.close() + close_video(video) sample[Fields.stats][StatsKeys.video_aspect_ratios] = [ video_aspect_ratios[key] for key in loaded_video_keys diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html index 58798175f..61458e793 100644 --- a/_modules/data_juicer/ops/filter/video_duration_filter.html +++ b/_modules/data_juicer/ops/filter/video_duration_filter.html @@ -88,7 +88,8 @@

Source code for data_juicer.ops.filter.video_duration_filter

from jsonargparse.typing import NonNegativeInt from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import load_data_with_context, load_video +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video) from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_VIDEOS @@ -152,7 +153,7 @@

Source code for data_juicer.ops.filter.video_duration_filter

video_durations[video_key] = round(stream.duration * stream.time_base) if not context: - video.close() + close_video(video) # get video durations sample[Fields.stats][StatsKeys.video_duration] = [ diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html index a4c4a3664..9e4641138 100644 --- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html +++ b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html @@ -88,7 +88,8 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filter

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames, +from data_juicer.utils.mm_utils import (SpecialTokens, close_video, + extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video, remove_special_tokens) @@ -279,7 +280,7 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filter

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return sample

diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html index 646697769..81ab4494b 100644 --- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html +++ b/_modules/data_juicer/ops/filter/video_nsfw_filter.html @@ -87,7 +87,7 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import (extract_key_frames, +from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model @@ -240,7 +240,7 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return sample
diff --git a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html index 10db9b6ed..3000f02fa 100644 --- a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html +++ b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html @@ -90,7 +90,8 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

from data_juicer import cuda_device_count from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import (extract_video_frames_uniformly, +from data_juicer.utils.mm_utils import (close_video, + extract_video_frames_uniformly, load_data_with_context, load_video) from ..base_op import OPERATORS, UNFORKABLE, Filter @@ -255,7 +256,7 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter

video_ocr_area_ratios[video_key] = np.mean(frame_ocr_area_ratios) if not context: - container.close() + close_video(container) # get video durations sample[Fields.stats][StatsKeys.video_ocr_area_ratio] = [ diff --git a/_modules/data_juicer/ops/filter/video_resolution_filter.html b/_modules/data_juicer/ops/filter/video_resolution_filter.html index df7875f6d..475dca8a6 100644 --- a/_modules/data_juicer/ops/filter/video_resolution_filter.html +++ b/_modules/data_juicer/ops/filter/video_resolution_filter.html @@ -88,7 +88,8 @@

Source code for data_juicer.ops.filter.video_resolution_filter

from jsonargparse.typing import PositiveInt from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import load_data_with_context, load_video +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video) from ..base_op import OPERATORS, Filter from ..op_fusion import LOADED_VIDEOS @@ -175,7 +176,7 @@

Source code for data_juicer.ops.filter.video_resolution_filter

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return sample
diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html index 84a31bcf8..c92ced165 100644 --- a/_modules/data_juicer/ops/filter/video_watermark_filter.html +++ b/_modules/data_juicer/ops/filter/video_watermark_filter.html @@ -87,7 +87,7 @@

Source code for data_juicer.ops.filter.video_watermark_filter

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields, StatsKeys -from data_juicer.utils.mm_utils import (extract_key_frames, +from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model @@ -241,7 +241,7 @@

Source code for data_juicer.ops.filter.video_watermark_filter

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return sample
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html index fd190d1e5..c3ea1eb1e 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html @@ -82,7 +82,8 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper

-import copy
+# yapf: disable
+import copy
 import random
 
 import numpy as np
@@ -92,7 +93,8 @@ 

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames, +from data_juicer.utils.mm_utils import (SpecialTokens, close_video, + extract_key_frames, extract_video_frames_uniformly, insert_texts_after_placeholders, load_data_with_context, load_video, @@ -369,7 +371,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return generated_samples def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html index 628cde450..eafafa8c1 100644 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html @@ -82,7 +82,8 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper

-import copy
+# yapf: disable
+import copy
 import random
 
 import numpy as np
@@ -92,7 +93,8 @@ 

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import HashKeys -from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames, +from data_juicer.utils.mm_utils import (SpecialTokens, close_video, + extract_key_frames, extract_video_frames_uniformly, insert_texts_after_placeholders, load_data_with_context, load_video, @@ -376,7 +378,7 @@

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) return generated_samples def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): diff --git a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html index 501c3a0c9..e2cb0392b 100644 --- a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html @@ -87,8 +87,9 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename -from data_juicer.utils.mm_utils import (load_data_with_context, load_video, - pil_to_opencv, process_each_frame) +from data_juicer.utils.mm_utils import (close_video, load_data_with_context, + load_video, pil_to_opencv, + process_each_frame) from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS @@ -177,7 +178,7 @@

Source code for data_juicer.ops.mapper.video_face_blur_mapper

processed_video_keys[video_key] = output_video_key if not context: - video.close() + close_video(video) # when the file is modified, its source file needs to be updated. for i, value in enumerate(loaded_video_keys): diff --git a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html index 7c23117e9..b8cbf5a59 100644 --- a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html @@ -92,7 +92,8 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from data_juicer.utils.mm_utils import (extract_video_frames_uniformly, +from data_juicer.utils.mm_utils import (close_video, + extract_video_frames_uniformly, load_data_with_context, load_video, parse_string_to_roi, process_each_frame) @@ -317,7 +318,7 @@

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

if not context: for vid_key in videos: - videos[vid_key].close() + close_video(videos[vid_key]) # when the file is modified, its source file needs to be updated. for i, value in enumerate(sample[self.video_key]): diff --git a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html index c2173a567..852809345 100644 --- a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html @@ -90,7 +90,7 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from data_juicer.utils.mm_utils import load_video +from data_juicer.utils.mm_utils import close_video, load_video from ..base_op import OPERATORS, Mapper @@ -201,7 +201,7 @@

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

original_width = video.codec_context.width original_height = video.codec_context.height original_aspect_ratio = Fraction(original_width, original_height) - container.close() + close_video(container) if (original_aspect_ratio >= self.min_ratio and original_aspect_ratio <= self.max_ratio): diff --git a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html index 572c9bbd8..24dc4cbaa 100644 --- a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html @@ -92,7 +92,7 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import transfer_filename from data_juicer.utils.logger_utils import HiddenPrints -from data_juicer.utils.mm_utils import load_video +from data_juicer.utils.mm_utils import close_video, load_video from ..base_op import OPERATORS, Mapper from ..op_fusion import LOADED_VIDEOS @@ -186,7 +186,7 @@

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

width = video.codec_context.width height = video.codec_context.height origin_ratio = width / height - container.close() + close_video(container) if width >= self.min_width and width <= self.max_width and \ height >= self.min_height and height <= self.max_height: diff --git a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html index d36b28733..a274db820 100644 --- a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html @@ -90,7 +90,8 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import (add_suffix_to_filename, transfer_filename) -from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds, +from data_juicer.utils.mm_utils import (SpecialTokens, close_video, + cut_video_by_seconds, get_video_duration, load_video) from ..base_op import OPERATORS, Mapper @@ -207,7 +208,7 @@

Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

video = videos[video_key] new_video_keys = self.split_videos_by_duration( video_key, video) - video.close() + close_video(video) split_video_keys.extend(new_video_keys) place_holders.append(SpecialTokens.video * len(new_video_keys)) diff --git a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html index e5e2de283..7ff4ac7cc 100644 --- a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html @@ -88,7 +88,8 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< from data_juicer.utils.constant import Fields from data_juicer.utils.file_utils import (add_suffix_to_filename, transfer_filename) -from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds, +from data_juicer.utils.mm_utils import (SpecialTokens, close_video, + cut_video_by_seconds, get_key_frame_seconds, load_video) from ..base_op import OPERATORS, Mapper @@ -189,7 +190,7 @@

Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

< video_count]: video = videos[video_key] new_video_keys = self.get_split_key_frame(video_key, video) - video.close() + close_video(video) split_video_keys.extend(new_video_keys) place_holders.append(SpecialTokens.video * len(new_video_keys)) diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html index a78d9a2e9..f13b844bf 100644 --- a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html +++ b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html @@ -88,7 +88,7 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

from data_juicer.utils.availability_utils import AvailabilityChecking from data_juicer.utils.constant import Fields -from data_juicer.utils.mm_utils import (extract_key_frames, +from data_juicer.utils.mm_utils import (close_video, extract_key_frames, extract_video_frames_uniformly, load_data_with_context, load_video) from data_juicer.utils.model_utils import get_model, prepare_model @@ -194,6 +194,11 @@

Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

word_count = Counter(words) sorted_word_list = [item for item, _ in word_count.most_common()] video_tags.append(sorted_word_list) + + if not context: + for vid_key in videos: + close_video(videos[vid_key]) + sample[Fields.video_frame_tags] = video_tags return sample