diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
index 842d016c3..86b88ec8c 100644
--- a/_modules/data_juicer/ops/base_op.html
+++ b/_modules/data_juicer/ops/base_op.html
@@ -144,6 +144,7 @@
logger.error(
f'An error occurred in mapper operation when processing '
f'samples {samples}, {type(e)}: {e}')
+ traceback.print_exc()
ret = {key: [] for key in samples.keys()}
ret[Fields.stats] = []
ret[Fields.source_file] = []
@@ -181,6 +182,7 @@ Source code for data_juicer.ops.base_op
logger.error(
f'An error occurred in mapper operation when processing '
f'sample {sample}, {type(e)}: {e}')
+ traceback.print_exc()
ret = {key: [] for key in sample.keys()}
ret[Fields.stats] = []
ret[Fields.source_file] = []
diff --git a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
index 14cd0401c..76113ded6 100644
--- a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
@@ -86,7 +86,8 @@ Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
from
jsonargparse.typing import PositiveInt
-
from data_juicer.utils.mm_utils import load_data_with_context, load_video
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video)
from ..base_op import OPERATORS
from ..op_fusion import LOADED_VIDEOS
@@ -136,6 +137,9 @@
Source code for data_juicer.ops.deduplicator.ray_video_deduplicator
if packet.stream.type == 'video':
md5_hash.update(bytes(packet))
+ for key in videos:
+ close_video(videos[key])
+
return md5_hash.hexdigest()
diff --git a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
index 079719912..a250ba39f 100644
--- a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
+++ b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
@@ -87,7 +87,8 @@ Source code for data_juicer.ops.deduplicator.video_deduplicator
from
typing import Dict, Set, Tuple
from data_juicer.utils.constant import HashKeys
-
from data_juicer.utils.mm_utils import load_data_with_context, load_video
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video)
from ..base_op import OPERATORS, Deduplicator
from ..op_fusion import LOADED_VIDEOS
@@ -145,6 +146,9 @@
Source code for data_juicer.ops.deduplicator.video_deduplicator
if packet.stream.type == 'video':
md5_hash.update(bytes(packet))
+ for key in videos:
+ close_video(videos[key])
+
sample[HashKeys.videohash] = md5_hash.hexdigest()
return sample
diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
index 60d935d11..e49760577 100644
--- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
+++ b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
@@ -88,7 +88,7 @@
Source code for data_juicer.ops.filter.video_aesthetics_filter
from
data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import (extract_key_frames,
+
from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
@@ -265,7 +265,7 @@
Source code for data_juicer.ops.filter.video_aesthetics_filter
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return sample
diff --git a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html
index 19d9a9638..66b218024 100644
--- a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html
+++ b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html
@@ -87,7 +87,8 @@
Source code for data_juicer.ops.filter.video_aspect_ratio_filter
import
numpy as np
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import load_data_with_context, load_video
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video)
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_VIDEOS
@@ -151,7 +152,7 @@
Source code for data_juicer.ops.filter.video_aspect_ratio_filter
video_aspect_ratios
[
key] = stream.codec_context.width / stream.codec_context.height
if not context:
-
video.close()
+
close_video(video)
sample[Fields.stats][StatsKeys.video_aspect_ratios] = [
video_aspect_ratios[key] for key in loaded_video_keys
diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html
index 58798175f..61458e793 100644
--- a/_modules/data_juicer/ops/filter/video_duration_filter.html
+++ b/_modules/data_juicer/ops/filter/video_duration_filter.html
@@ -88,7 +88,8 @@
Source code for data_juicer.ops.filter.video_duration_filter
from
jsonargparse.typing import NonNegativeInt
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import load_data_with_context, load_video
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video)
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_VIDEOS
@@ -152,7 +153,7 @@
Source code for data_juicer.ops.filter.video_duration_filter
video_durations[video_key] = round(stream.duration *
stream.time_base)
if not context:
- video.close()
+ close_video(video)
# get video durations
sample[Fields.stats][StatsKeys.video_duration] = [
diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
index a4c4a3664..9e4641138 100644
--- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
+++ b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
@@ -88,7 +88,8 @@
Source code for data_juicer.ops.filter.video_frames_text_similarity_filter
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames,
+from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
+ extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video,
remove_special_tokens)
@@ -279,7 +280,7 @@ Source code for data_juicer.ops.filter.video_frames_text_similarity_filter
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return sample
diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html
index 646697769..81ab4494b 100644
--- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html
+++ b/_modules/data_juicer/ops/filter/video_nsfw_filter.html
@@ -87,7 +87,7 @@
Source code for data_juicer.ops.filter.video_nsfw_filter
from
data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import (extract_key_frames,
+
from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
from data_juicer.utils.model_utils import get_model, prepare_model
@@ -240,7 +240,7 @@
Source code for data_juicer.ops.filter.video_nsfw_filter
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return sample
diff --git a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html
index 10db9b6ed..3000f02fa 100644
--- a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html
+++ b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html
@@ -90,7 +90,8 @@
Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter
from
data_juicer import cuda_device_count
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
+
from data_juicer.utils.mm_utils import (close_video,
+
extract_video_frames_uniformly,
load_data_with_context, load_video)
from ..base_op import OPERATORS, UNFORKABLE, Filter
@@ -255,7 +256,7 @@
Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter
video_ocr_area_ratios
[video_key] = np.mean(frame_ocr_area_ratios)
if not context:
-
container.close()
+
close_video(container)
# get video durations
sample[Fields.stats][StatsKeys.video_ocr_area_ratio] = [
diff --git a/_modules/data_juicer/ops/filter/video_resolution_filter.html b/_modules/data_juicer/ops/filter/video_resolution_filter.html
index df7875f6d..475dca8a6 100644
--- a/_modules/data_juicer/ops/filter/video_resolution_filter.html
+++ b/_modules/data_juicer/ops/filter/video_resolution_filter.html
@@ -88,7 +88,8 @@
Source code for data_juicer.ops.filter.video_resolution_filter
from
jsonargparse.typing import PositiveInt
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import load_data_with_context, load_video
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video)
from ..base_op import OPERATORS, Filter
from ..op_fusion import LOADED_VIDEOS
@@ -175,7 +176,7 @@
Source code for data_juicer.ops.filter.video_resolution_filter
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return sample
diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html
index 84a31bcf8..c92ced165 100644
--- a/_modules/data_juicer/ops/filter/video_watermark_filter.html
+++ b/_modules/data_juicer/ops/filter/video_watermark_filter.html
@@ -87,7 +87,7 @@
Source code for data_juicer.ops.filter.video_watermark_filter
from
data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields, StatsKeys
-
from data_juicer.utils.mm_utils import (extract_key_frames,
+
from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
from data_juicer.utils.model_utils import get_model, prepare_model
@@ -241,7 +241,7 @@
Source code for data_juicer.ops.filter.video_watermark_filter
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return sample
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html
index fd190d1e5..c3ea1eb1e 100644
--- a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html
@@ -82,7 +82,8 @@
Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper
-import copy
+# yapf: disable
+import copy
import random
import numpy as np
@@ -92,7 +93,8 @@ Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames,
+from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
+ extract_key_frames,
extract_video_frames_uniformly,
insert_texts_after_placeholders,
load_data_with_context, load_video,
@@ -369,7 +371,7 @@ Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return generated_samples
def _reduce_captions(self, chunk, generated_text_candidates_single_chunk):
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html
index 628cde450..eafafa8c1 100644
--- a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html
@@ -82,7 +82,8 @@
Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper
-import copy
+# yapf: disable
+import copy
import random
import numpy as np
@@ -92,7 +93,8 @@ Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, extract_key_frames,
+from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
+ extract_key_frames,
extract_video_frames_uniformly,
insert_texts_after_placeholders,
load_data_with_context, load_video,
@@ -376,7 +378,7 @@ Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
return generated_samples
def _reduce_captions(self, chunk, generated_text_candidates_single_chunk):
diff --git a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html
index 501c3a0c9..e2cb0392b 100644
--- a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html
@@ -87,8 +87,9 @@ Source code for data_juicer.ops.mapper.video_face_blur_mapper
from
data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
-
from data_juicer.utils.mm_utils import (load_data_with_context, load_video,
-
pil_to_opencv, process_each_frame)
+
from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
+
load_video, pil_to_opencv,
+
process_each_frame)
from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS
@@ -177,7 +178,7 @@
Source code for data_juicer.ops.mapper.video_face_blur_mapper
processed_video_keys[video_key] = output_video_key
if not context:
- video.close()
+ close_video(video)
# when the file is modified, its source file needs to be updated.
for i, value in enumerate(loaded_video_keys):
diff --git a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html
index 7c23117e9..b8cbf5a59 100644
--- a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html
@@ -92,7 +92,8 @@
Source code for data_juicer.ops.mapper.video_remove_watermark_mapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import (extract_video_frames_uniformly,
+from data_juicer.utils.mm_utils import (close_video,
+ extract_video_frames_uniformly,
load_data_with_context, load_video,
parse_string_to_roi,
process_each_frame)
@@ -317,7 +318,7 @@ Source code for data_juicer.ops.mapper.video_remove_watermark_mapper
if not context:
for vid_key in videos:
- videos[vid_key].close()
+ close_video(videos[vid_key])
# when the file is modified, its source file needs to be updated.
for i, value in enumerate(sample[self.video_key]):
diff --git a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html
index c2173a567..852809345 100644
--- a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html
@@ -90,7 +90,7 @@ Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import load_video
+from data_juicer.utils.mm_utils import close_video, load_video
from ..base_op import OPERATORS, Mapper
@@ -201,7 +201,7 @@ Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper
original_width = video.codec_context.width
original_height = video.codec_context.height
original_aspect_ratio = Fraction(original_width, original_height)
- container.close()
+ close_video(container)
if (original_aspect_ratio >= self.min_ratio
and original_aspect_ratio <= self.max_ratio):
diff --git a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html
index 572c9bbd8..24dc4cbaa 100644
--- a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html
@@ -92,7 +92,7 @@ Source code for data_juicer.ops.mapper.video_resize_resolution_mapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import transfer_filename
from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import load_video
+from data_juicer.utils.mm_utils import close_video, load_video
from ..base_op import OPERATORS, Mapper
from ..op_fusion import LOADED_VIDEOS
@@ -186,7 +186,7 @@ Source code for data_juicer.ops.mapper.video_resize_resolution_mapper
width = video.codec_context.width
height = video.codec_context.height
origin_ratio = width / height
- container.close()
+ close_video(container)
if width >= self.min_width and width <= self.max_width and \
height >= self.min_height and height <= self.max_height:
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html
index d36b28733..a274db820 100644
--- a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html
@@ -90,7 +90,8 @@ Source code for data_juicer.ops.mapper.video_split_by_duration_mapper
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (add_suffix_to_filename,
transfer_filename)
-from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
+from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
+ cut_video_by_seconds,
get_video_duration, load_video)
from ..base_op import OPERATORS, Mapper
@@ -207,7 +208,7 @@ Source code for data_juicer.ops.mapper.video_split_by_duration_mapper
video = videos[video_key]
new_video_keys = self.split_videos_by_duration(
video_key, video)
- video.close()
+ close_video(video)
split_video_keys.extend(new_video_keys)
place_holders.append(SpecialTokens.video *
len(new_video_keys))
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html
index e5e2de283..7ff4ac7cc 100644
--- a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html
@@ -88,7 +88,8 @@ Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper
<
from data_juicer.utils.constant import Fields
from data_juicer.utils.file_utils import (add_suffix_to_filename,
transfer_filename)
-from data_juicer.utils.mm_utils import (SpecialTokens, cut_video_by_seconds,
+from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
+ cut_video_by_seconds,
get_key_frame_seconds, load_video)
from ..base_op import OPERATORS, Mapper
@@ -189,7 +190,7 @@ Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper
<
video_count]:
video = videos[video_key]
new_video_keys = self.get_split_key_frame(video_key, video)
- video.close()
+ close_video(video)
split_video_keys.extend(new_video_keys)
place_holders.append(SpecialTokens.video *
len(new_video_keys))
diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html
index a78d9a2e9..f13b844bf 100644
--- a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html
+++ b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html
@@ -88,7 +88,7 @@ Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper
from data_juicer.utils.availability_utils import AvailabilityChecking
from data_juicer.utils.constant import Fields
-from data_juicer.utils.mm_utils import (extract_key_frames,
+from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
extract_video_frames_uniformly,
load_data_with_context, load_video)
from data_juicer.utils.model_utils import get_model, prepare_model
@@ -194,6 +194,11 @@ Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper
word_count = Counter(words)
sorted_word_list = [item for item, _ in word_count.most_common()]
video_tags.append(sorted_word_list)
+
+ if not context:
+ for vid_key in videos:
+ close_video(videos[vid_key])
+
sample[Fields.video_frame_tags] = video_tags
return sample