From 940b94110d3258b87dc58d74639b219ec15635aa Mon Sep 17 00:00:00 2001 From: yxdyc Date: Fri, 2 Aug 2024 09:48:04 +0000 Subject: [PATCH] deploy: 4d8c521fb8477ceda45f2adad10979624ee0ca6c --- .../ops/filter/video_aesthetics_filter.html | 34 +-- .../ops/filter/video_duration_filter.html | 6 +- .../video_frames_text_similarity_filter.html | 35 +-- .../ops/filter/video_nsfw_filter.html | 33 +-- .../ops/filter/video_watermark_filter.html | 30 ++- .../ops/selector/random_selector.html | 163 +++++++++++++ .../range_specified_field_selector.html | 223 ++++++++++++++++++ .../topk_specified_field_selector.html | 16 +- _modules/index.html | 2 + data_juicer.ops.filter.html | 4 +- data_juicer.ops.selector.html | 103 ++++++++ genindex.html | 16 +- index.html | 2 + objects.inv | Bin 5281 -> 5335 bytes searchindex.js | 2 +- 15 files changed, 592 insertions(+), 77 deletions(-) create mode 100644 _modules/data_juicer/ops/selector/random_selector.html create mode 100644 _modules/data_juicer/ops/selector/range_specified_field_selector.html diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html index e49760577..809086399 100644 --- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html +++ b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html @@ -239,23 +239,27 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter

sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - # compute aesthetics_scores - model, processor = get_model(self.model_key, rank, self.use_cuda()) - inputs = processor(images=frame_images, - return_tensors='pt').to(model.device) - with torch.no_grad(): - outputs = model(**inputs) - if self.need_normalized_by_ten: - aesthetics_score = outputs.logits / 10.0 - else: - aesthetics_score = outputs.logits + if len(frame_images) > 0: + # compute aesthetics_scores + model, processor = get_model(self.model_key, rank=rank) + inputs = processor(images=frame_images, + return_tensors='pt').to(model.device) + with torch.no_grad(): + outputs = model(**inputs) + if self.need_normalized_by_ten: + aesthetics_score = outputs.logits / 10.0 + else: + aesthetics_score = outputs.logits - if self.reduce_mode == 'avg': - aesthetics_score = float(aesthetics_score.mean()) - elif self.reduce_mode == 'max': - aesthetics_score = float(aesthetics_score.max()) + if self.reduce_mode == 'avg': + aesthetics_score = float(aesthetics_score.mean()) + elif self.reduce_mode == 'max': + aesthetics_score = float(aesthetics_score.max()) + else: + aesthetics_score = float(aesthetics_score.min()) else: - aesthetics_score = float(aesthetics_score.min()) + aesthetics_score = 0.0 + aesthetics_scores.append(aesthetics_score) logger.debug(f'aesthetics_score: {aesthetics_scores}') diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html index 61458e793..27c541de5 100644 --- a/_modules/data_juicer/ops/filter/video_duration_filter.html +++ b/_modules/data_juicer/ops/filter/video_duration_filter.html @@ -85,7 +85,7 @@

Source code for data_juicer.ops.filter.video_duration_filter

import sys import numpy as np -from jsonargparse.typing import NonNegativeInt +from jsonargparse.typing import NonNegativeFloat from data_juicer.utils.constant import Fields, StatsKeys from data_juicer.utils.mm_utils import (close_video, load_data_with_context, @@ -104,8 +104,8 @@

Source code for data_juicer.ops.filter.video_duration_filter

"""
[docs] def __init__(self, - min_duration: NonNegativeInt = 0, - max_duration: NonNegativeInt = sys.maxsize, + min_duration: NonNegativeFloat = 0, + max_duration: NonNegativeFloat = sys.maxsize, any_or_all: str = 'any', *args, **kwargs): diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html index 9e4641138..a60d7b2e5 100644 --- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html +++ b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html @@ -255,23 +255,26 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filterimage = ImageOps.flip(image) video_frame_images_chunk.append(image) - inputs = processor(text=text_chunk, - images=video_frame_images_chunk, - return_tensors='pt', - truncation=True, - max_length=model.config.text_config. - max_position_embeddings, - padding=True).to(model.device) - - outputs = model(**inputs) - chunk_logits = outputs.logits_per_text / 100.0 - - if self.reduce_mode == 'avg': - chunk_similarity = chunk_logits.mean() - elif self.reduce_mode == 'max': - chunk_similarity = chunk_logits.max() + if len(video_frame_images_chunk) > 0: + inputs = processor(text=text_chunk, + images=video_frame_images_chunk, + return_tensors='pt', + truncation=True, + max_length=model.config.text_config. + max_position_embeddings, + padding=True).to(model.device) + + outputs = model(**inputs) + chunk_logits = outputs.logits_per_text / 100.0 + + if self.reduce_mode == 'avg': + chunk_similarity = chunk_logits.mean() + elif self.reduce_mode == 'max': + chunk_similarity = chunk_logits.max() + else: + chunk_similarity = chunk_logits.min() else: - chunk_similarity = chunk_logits.min() + chunk_similarity = 0.0 similarity.append(float(chunk_similarity)) offset += count diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html index 81ab4494b..856298073 100644 --- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html +++ b/_modules/data_juicer/ops/filter/video_nsfw_filter.html @@ -219,21 +219,26 @@

Source code for data_juicer.ops.filter.video_nsfw_filter

sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - inputs = processor(images=frame_images, return_tensors='pt') - inputs = inputs.to(model.device) - outputs = model(**inputs) - logits = outputs.logits - cur_scores = [ - scores[1] for scores in torch.softmax(logits, dim=-1) - ] - cur_scores = torch.Tensor(cur_scores) - - if self.reduce_mode == 'avg': - cur_score = cur_scores.mean() - elif self.reduce_mode == 'max': - cur_score = cur_scores.max() + + if len(frame_images) > 0: + inputs = processor(images=frame_images, return_tensors='pt') + inputs = inputs.to(model.device) + outputs = model(**inputs) + logits = outputs.logits + cur_scores = [ + scores[1] for scores in torch.softmax(logits, dim=-1) + ] + cur_scores = torch.Tensor(cur_scores) + + if self.reduce_mode == 'avg': + cur_score = cur_scores.mean() + elif self.reduce_mode == 'max': + cur_score = cur_scores.max() + else: + cur_score = cur_scores.min() else: - cur_score = cur_scores.min() + cur_score = 0.0 + nsfw_scores.append(float(cur_score)) sample[Fields.stats][StatsKeys.video_nsfw_score] = nsfw_scores diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html index c92ced165..8ae26e9a9 100644 --- a/_modules/data_juicer/ops/filter/video_watermark_filter.html +++ b/_modules/data_juicer/ops/filter/video_watermark_filter.html @@ -222,19 +222,25 @@

Source code for data_juicer.ops.filter.video_watermark_filter

sample[Fields.context][sampled_frames_key] = frames frame_images = [frame.to_image() for frame in frames] - inputs = processor(images=frame_images, return_tensors='pt') - inputs = inputs.to(model.device) - outputs = model(**inputs) - logits = outputs.logits - cur_probs = [probs[1] for probs in torch.softmax(logits, dim=-1)] - cur_probs = torch.Tensor(cur_probs) - - if self.reduce_mode == 'avg': - cur_prob = cur_probs.mean() - elif self.reduce_mode == 'max': - cur_prob = cur_probs.max() + + if len(frame_images) > 0: + inputs = processor(images=frame_images, return_tensors='pt') + inputs = inputs.to(model.device) + outputs = model(**inputs) + logits = outputs.logits + cur_probs = [ + probs[1] for probs in torch.softmax(logits, dim=-1) + ] + cur_probs = torch.Tensor(cur_probs) + + if self.reduce_mode == 'avg': + cur_prob = cur_probs.mean() + elif self.reduce_mode == 'max': + cur_prob = cur_probs.max() + else: + cur_prob = cur_probs.min() else: - cur_prob = cur_probs.min() + cur_prob = 0.0 watermark_probs.append(float(cur_prob)) sample[Fields.stats][StatsKeys.video_watermark_prob] = watermark_probs diff --git a/_modules/data_juicer/ops/selector/random_selector.html b/_modules/data_juicer/ops/selector/random_selector.html new file mode 100644 index 000000000..175203884 --- /dev/null +++ b/_modules/data_juicer/ops/selector/random_selector.html @@ -0,0 +1,163 @@ + + + + + + data_juicer.ops.selector.random_selector — data_juicer 0.2.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.ops.selector.random_selector

+from jsonargparse.typing import ClosedUnitInterval, PositiveInt
+
+from data_juicer.format.mixture_formatter import MixtureFormatter
+
+from ..base_op import OPERATORS, Selector
+
+
+
[docs]@OPERATORS.register_module('random_selector') +class RandomSelector(Selector): + """Selector to random select samples. """ + +
[docs] def __init__(self, + select_ratio: ClosedUnitInterval = None, + select_num: PositiveInt = None, + *args, + **kwargs): + """ + Initialization method. + + :param select_ratio: The ratio to select. When both + select_ratio and select_num are set, the value corresponding + to the smaller number of samples will be applied. + :param select_num: The number of samples to select. When both + select_ratio and select_num are set, the value corresponding + to the smaller number of samples will be applied. + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self.select_ratio = select_ratio + self.select_num = select_num
+ +
[docs] def process(self, dataset): + if len(dataset) <= 1: + return dataset + + if self.select_ratio is None and self.select_num is None: + return dataset + + select_num = 0 + if not self.select_ratio: + select_num = self.select_num + else: + select_num = int(self.select_ratio * len(dataset)) + if self.select_num and self.select_num < select_num: + select_num = self.select_num + + return MixtureFormatter.random_sample(dataset, + sample_number=select_num)
+
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/ops/selector/range_specified_field_selector.html b/_modules/data_juicer/ops/selector/range_specified_field_selector.html new file mode 100644 index 000000000..50404a19c --- /dev/null +++ b/_modules/data_juicer/ops/selector/range_specified_field_selector.html @@ -0,0 +1,223 @@ + + + + + + data_juicer.ops.selector.range_specified_field_selector — data_juicer 0.2.0 documentation + + + + + + + + + + + + + + + +
+ + +
+ +
+
+
+ +
+
+
+
+ +

Source code for data_juicer.ops.selector.range_specified_field_selector

+import heapq
+
+from jsonargparse.typing import ClosedUnitInterval, PositiveInt
+
+from data_juicer.utils.common_utils import stats_to_number
+
+from ..base_op import OPERATORS, Selector
+
+
+
[docs]@OPERATORS.register_module('range_specified_field_selector') +class RangeSpecifiedFieldSelector(Selector): + """Selector to select a range of samples based on the sorted + specified field value from smallest to largest. """ + +
[docs] def __init__(self, + field_key: str = '', + lower_percentile: ClosedUnitInterval = None, + upper_percentile: ClosedUnitInterval = None, + lower_rank: PositiveInt = None, + upper_rank: PositiveInt = None, + *args, + **kwargs): + """ + Initialization method. + + :param field_key: Selector based on the specified value + corresponding to the target key. The target key + corresponding to multi-level field information need to be + separated by '.'. + :param lower_percentile: The lower bound of the percentile to + be sample, samples will be selected if their specified field + values are greater than this lower bound. When both + lower_percentile and lower_rank are set, the value corresponding + to the larger number of samples will be applied. + :param upper_percentile: The upper bound of the percentile to + be sample, samples will be selected if their specified field + values are less or equal to the upper bound. When both + upper_percentile and upper_rank are set, the value corresponding + to the smaller number of samples will be applied. + :param lower_rank: The lower bound of the rank to be sample, + samples will be selected if their specified field values are + greater than this lower bound. When both lower_percentile and + lower_rank are set, the value corresponding to the larger number + of samples will be applied. + :param upper_rank: The upper bound of the rank to be sample, + samples will be selected if their specified field values are + less or equal to the upper bound. When both upper_percentile and + upper_rank are set, the value corresponding to the smaller number + of samples will be applied. + :param args: extra args + :param kwargs: extra args + """ + super().__init__(*args, **kwargs) + self.field_key = field_key + self.lower_percentile = lower_percentile + self.upper_percentile = upper_percentile + self.lower_rank = lower_rank + self.upper_rank = upper_rank
+ +
[docs] def process(self, dataset): + if len(dataset) <= 1 or not self.field_key: + return dataset + + if self.lower_percentile is None and self.lower_rank is None: + return dataset + if self.upper_percentile is None and self.upper_rank is None: + return dataset + + lower_bound, upper_bound = 0, len(dataset) + if self.lower_percentile is not None: + lower_bound = int(self.lower_percentile * len(dataset)) + if self.lower_rank is not None: + lower_bound = max(lower_bound, self.lower_rank) + if self.upper_percentile is not None: + upper_bound = int(self.upper_percentile * len(dataset)) + if self.upper_rank is not None: + upper_bound = min(upper_bound, self.upper_rank) + upper_bound = max(lower_bound, upper_bound) + + field_keys = self.field_key.split('.') + assert field_keys[0] in dataset.features.keys( + ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) + + def get_field_value_list(cur_dataset, field_keys): + if len(field_keys) == 1: + field_value_list = cur_dataset[field_keys[0]] + else: + field_value_list = [] + for item in cur_dataset[field_keys[0]]: + field_value = item + for key in field_keys[1:]: + assert key in field_value.keys( + ), "'{}' not in {}".format(key, field_value.keys()) + field_value = field_value[key] + field_value_list.append(field_value) + field_value_list = [stats_to_number(s) for s in field_value_list] + return field_value_list + + field_value_list = get_field_value_list(dataset, field_keys) + select_index = heapq.nsmallest(int(upper_bound), range(len(dataset)), + field_value_list.__getitem__) + sub_dataset = dataset.select(select_index) + + field_value_list = get_field_value_list(sub_dataset, field_keys) + select_index = heapq.nlargest(int(upper_bound - lower_bound), + range(len(sub_dataset)), + field_value_list.__getitem__) + + return sub_dataset.select(select_index)
+
+ +
+
+
+ +
+ +
+

© Copyright 2024, Data-Juicer Team.

+
+ + Built with Sphinx using a + theme + provided by Read the Docs. + + +
+
+
+
+
+ + + + \ No newline at end of file diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html index df7008c7a..48cbad3a0 100644 --- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html +++ b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html @@ -83,21 +83,12 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

 import heapq
-import sys
 
 from jsonargparse.typing import ClosedUnitInterval, PositiveInt
 
-from ..base_op import OPERATORS, Selector
-
+from data_juicer.utils.common_utils import stats_to_number
 
-def to_number(s, reverse=True):
-    try:
-        return float(s)
-    except Exception:
-        if reverse:
-            return -sys.maxsize
-        else:
-            return sys.maxsize
+from ..base_op import OPERATORS, Selector
 
 
 
[docs]@OPERATORS.register_module('topk_specified_field_selector') @@ -169,7 +160,8 @@

Source code for data_juicer.ops.selector.topk_specified_field_selector

< assert key in field_value.keys(), "'{}' not in {}".format( key, field_value.keys()) field_value = field_value[key] - field_value_list.append(to_number(field_value, self.reverse)) + field_value_list.append( + stats_to_number(field_value, self.reverse)) if self.reverse: select_index = heapq.nlargest(int(select_num), range(len(dataset)), diff --git a/_modules/index.html b/_modules/index.html index 751381c6e..3b8220535 100644 --- a/_modules/index.html +++ b/_modules/index.html @@ -196,6 +196,8 @@

All modules for which code is available

  • data_juicer.ops.mapper.video_tagging_from_frames_mapper
  • data_juicer.ops.mapper.whitespace_normalization_mapper
  • data_juicer.ops.selector.frequency_specified_field_selector
  • +
  • data_juicer.ops.selector.random_selector
  • +
  • data_juicer.ops.selector.range_specified_field_selector
  • data_juicer.ops.selector.topk_specified_field_selector
  • diff --git a/data_juicer.ops.filter.html b/data_juicer.ops.filter.html index 4335f54ca..18c1a115a 100644 --- a/data_juicer.ops.filter.html +++ b/data_juicer.ops.filter.html @@ -1702,12 +1702,12 @@
    -class data_juicer.ops.filter.VideoDurationFilter(min_duration: NonNegativeInt = 0, max_duration: NonNegativeInt = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]¶
    +class data_juicer.ops.filter.VideoDurationFilter(min_duration: NonNegativeFloat = 0, max_duration: NonNegativeFloat = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]¶

    Bases: Filter

    Keep data samples whose videos’ durations are within a specified range.

    -__init__(min_duration: NonNegativeInt = 0, max_duration: NonNegativeInt = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]¶
    +__init__(min_duration: NonNegativeFloat = 0, max_duration: NonNegativeFloat = 9223372036854775807, any_or_all: str = 'any', *args, **kwargs)[source]¶

    Initialization method.

    Parameters:
    diff --git a/data_juicer.ops.selector.html b/data_juicer.ops.selector.html index a9b425b92..5bfc7d485 100644 --- a/data_juicer.ops.selector.html +++ b/data_juicer.ops.selector.html @@ -54,6 +54,8 @@
  • data_juicer.ops.deduplicator
  • data_juicer.ops.selector
  • @@ -141,6 +143,107 @@
    +
    +
    +class data_juicer.ops.selector.RandomSelector(select_ratio: ClosedUnitInterval | None = None, select_num: PositiveInt | None = None, *args, **kwargs)[source]¶
    +

    Bases: Selector

    +

    Selector to random select samples.

    +
    +
    +__init__(select_ratio: ClosedUnitInterval | None = None, select_num: PositiveInt | None = None, *args, **kwargs)[source]¶
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • select_ratio – The ratio to select. When both +select_ratio and select_num are set, the value corresponding +to the smaller number of samples will be applied.

    • +
    • select_num – The number of samples to select. When both +select_ratio and select_num are set, the value corresponding +to the smaller number of samples will be applied.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process(dataset)[source]¶
    +

    Dataset –> dataset.

    +
    +
    Parameters:
    +

    dataset – input dataset

    +
    +
    Returns:
    +

    selected dataset.

    +
    +
    +
    + +
    + +
    +
    +class data_juicer.ops.selector.RangeSpecifiedFieldSelector(field_key: str = '', lower_percentile: ClosedUnitInterval | None = None, upper_percentile: ClosedUnitInterval | None = None, lower_rank: PositiveInt | None = None, upper_rank: PositiveInt | None = None, *args, **kwargs)[source]¶
    +

    Bases: Selector

    +

    Selector to select a range of samples based on the sorted +specified field value from smallest to largest.

    +
    +
    +__init__(field_key: str = '', lower_percentile: ClosedUnitInterval | None = None, upper_percentile: ClosedUnitInterval | None = None, lower_rank: PositiveInt | None = None, upper_rank: PositiveInt | None = None, *args, **kwargs)[source]¶
    +

    Initialization method.

    +
    +
    Parameters:
    +
      +
    • field_key – Selector based on the specified value +corresponding to the target key. The target key +corresponding to multi-level field information need to be +separated by ‘.’.

    • +
    • lower_percentile – The lower bound of the percentile to +be sample, samples will be selected if their specified field +values are greater than this lower bound. When both +lower_percentile and lower_rank are set, the value corresponding +to the larger number of samples will be applied.

    • +
    • upper_percentile – The upper bound of the percentile to +be sample, samples will be selected if their specified field +values are less or equal to the upper bound. When both +upper_percentile and upper_rank are set, the value corresponding +to the smaller number of samples will be applied.

    • +
    • lower_rank – The lower bound of the rank to be sample, +samples will be selected if their specified field values are +greater than this lower bound. When both lower_percentile and +lower_rank are set, the value corresponding to the larger number +of samples will be applied.

    • +
    • upper_rank – The upper bound of the rank to be sample, +samples will be selected if their specified field values are +less or equal to the upper bound. When both upper_percentile and +upper_rank are set, the value corresponding to the smaller number +of samples will be applied.

    • +
    • args – extra args

    • +
    • kwargs – extra args

    • +
    +
    +
    +
    + +
    +
    +process(dataset)[source]¶
    +

    Dataset –> dataset.

    +
    +
    Parameters:
    +

    dataset – input dataset

    +
    +
    Returns:
    +

    selected dataset.

    +
    +
    +
    + +
    +
    class data_juicer.ops.selector.TopkSpecifiedFieldSelector(field_key: str = '', top_ratio: ClosedUnitInterval | None = None, topk: PositiveInt | None = None, reverse: bool = True, *args, **kwargs)[source]¶
    diff --git a/genindex.html b/genindex.html index 365bcb868..7dc29b637 100644 --- a/genindex.html +++ b/genindex.html @@ -338,6 +338,10 @@

    _

  • (data_juicer.ops.Selector method)
  • (data_juicer.ops.selector.FrequencySpecifiedFieldSelector method) +
  • +
  • (data_juicer.ops.selector.RandomSelector method) +
  • +
  • (data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
  • (data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
  • @@ -1058,6 +1062,10 @@

    P

  • (data_juicer.ops.Selector method)
  • (data_juicer.ops.selector.FrequencySpecifiedFieldSelector method) +
  • +
  • (data_juicer.ops.selector.RandomSelector method) +
  • +
  • (data_juicer.ops.selector.RangeSpecifiedFieldSelector method)
  • (data_juicer.ops.selector.TopkSpecifiedFieldSelector method)
  • @@ -1073,6 +1081,10 @@

    R

      +
    • RemoveLongWordsMapper (class in data_juicer.ops.mapper) +
    • RemoveNonChineseCharacterlMapper (class in data_juicer.ops.mapper)
    • RemoveRepeatSentencesMapper (class in data_juicer.ops.mapper) diff --git a/index.html b/index.html index a470794c8..c3b18a91c 100644 --- a/index.html +++ b/index.html @@ -211,6 +211,8 @@

      Tutorialdata_juicer.ops.selector

    • diff --git a/objects.inv b/objects.inv index 080f7e111ddfb3f6fcf51900c910e1f78b2e1686..4650694583a6c9831f288a9bdab077f4ea654a91 100644 GIT binary patch delta 4903 zcmV+?6WHvbDc32Gx_?t8^Ks7Mxg?$!$dtB}{Hx0nR$o-FSI08#a(3}=7wOya%R_t2 zf97X4f4<*v?JFpD3V$9fbSg3^aw>3A9;>bm z2L5}GOYT4!{s7msJVosEOy!6Sgr8W)=Fi%guPT{-I6!FT~{P+Ru+SC1!xyOTinSOwFD&WW za_zI6ZV!HzGZ5feE*pBR`VwxqU_I9G`m^=E@Y^qc$=(eRv~BC`aj4MN!Tb#abBBR> zz`%sRGZPRmt3WVhYI=~SrEL=o+Lq*BOwiBww%V}ItZ7;Lp+DSkkQE=u z3?ukWn4{4&b+v95;UzhvMOTr8CI?1#SAhqzCL>_R@AWLp-z$1$k@*G}eP(*0f0brX26Xh9H08t$c)~ zG8F35QE(BVy6+o_E#ys-kYcEVy+;r11rq8R4l2%bP=t+hM3{+~I`tt!o%(>TQmUL&z%ACW{pMSG%t!iF?Lz0l3RWf2h+a@PRf5 z0Gw+W%pb`%L1IO@J&%zrHkscVXzv8#H@xnH6y)iIvD(eM^Dxn3H~FoBZcZRU9!wak zJ=l)Johv`w)6IcCoMD2TIHRog;<0aglCm-sXd45PwKz1B$e*$={B)lgSbnR0fq$p_ z%)sia;tPGgQw8sflH)YBW~zVqCC+MSx@_M!HyT#7tmt1Q!lzWqq&qU}IG@bpAb2k8 zGzvJ+j_BK+&xfpKO*M>WxAc(|>^aX0h9PPC*Rvf+Ce)GPjSx%%2k!e{6$Xljk#_qb zfONzH;__TAz`LPLwV1vX@j~bKGJm`H!}J#Ww&hf&RTO>|2a3$QJ%}Rosm}tq?y_Qd zlphc?4O8^c={SEx1a-WT1G&wPKPZqIrs#pxasG%v>Ubjs^5c>2S;X)u_|bxwjco)O$s6nS(2^xIPk*|cMtVo| zl-?r{D8_g%<_~Cl8CYVzCi)gw>^CKLL9C;Q>7u4Mj&0%ZS<1?|7Ah1mEmXRl9#J8m%E>#702(0NVd4C+5Dr7-xH(aL{-$SbY%X2n?AcEI?e}iVtJ78~TK-8{! z6P|_=lq{&vAyQR*hg6Nss2Y6)3d!fD!Bu{Vb69n{>^iRwwCLZsI!oNMwj%Z-P`|o= zJ-mNEYdwcgHSdewrCIr*tH-f?ob4(q)^`m)-gpfO+WF6Sx7*M6SbyI|OuXwe^f;c+ zDA0~?v+oS+_ZuSKZ2>!u*8&03>0!D4o_)#+_K6ky_JDI-kBoKRz=4l|fe09h_Xk?j zr2p%lRjkdRrMwXv8VxfGJRW8$U_@?ny5F<(YgwnTNGO=l2&m|A=T!=z<73z0nl99C0F`A&L=+ZFbCZT4(JmJ`ff%cARJ^Eb-!@P@yC8 zmG+am2OUH%A~5^4xKRJ~4DbNxX@C*9qc7R9J4P6BGay5wVB&zs!9)a%#1~dqIeS4? z2{I(qdj%YBy25~T`%u${{Zf}*k@i)S8weT%m@01ElQ{?^88n9O<~uw;{V2p>Rw>+1wMwJSqHT~@&1PAd#xw{MEJZ_R#S6)RF!B(G54_2BRb z7-*uz!oUTN3b^AdL&75vx2vl#Al)v%v7)b&)d)m?Y?p%p9}x>zoak5x(UH0-x6AqOJ9R+BPkA#T#J!X!NOn`R!?@4`KGa8|4R*~_}YdGRWKtnWa$<`lA zEx-8zU_r#EVIqnX784sf zI{5s57ac4<8qxD#v@AgPhhg&w3v6_-Q1`V2aQC$=VD}s3Xw)1K?syY%G)lq;x=e+2 z$i!pK_QuZSzY5%}Xjs5V(y8}Ld{#8C6s3W5s49*-(iAmuFZKtpIH@ zrbAiL?4F5h8B>590nVYM8P>26=Q^JNEh3&vj8LQ+SHXlx@hDQclJQqtj?;1n;=5db zBtV)5)<k*&!=tz@mg9Cpiag#3d^VLagwE^8AdLNU*7#XZF*3n zxFb)g)>Z1xr1W(HO|V9Z)j0u~IFq#3NmB64TE^^KvOlHY_U()z|CTJtXI`n%0&SGr z@bpx2ZWo;^c4vO?I+Z3^yTTz^uouRE@|1Wby#{ELF+GI6^_p}7ZF!-AIyKN{xoug^ zT9d}~$d#fbIJ1y3O^#So2UkiEv5fpje`BP*;F4}s9 z24uU|c@1Dr%UUnXugq_Uw4C9~BaWP}^zodW96*7Auj{<~k(C$nfe zF+i&c+aX!${@G{N0t9+@|Atn7nPxcBKIybhm=@D2tP$&2(@e~IF^#Q`9e?6VLM$mj)dRd9xJy0S`UX(_b+CO3@f1hW~V|)eCClnIo==o zetW~-m&FsSTbMsOSg1Ez0;DfFqs9BOdabkl0qBbc7VC?S0PV}ok!Cr6))N5|?}rGE z_Ttz6BCHi}hpnV=dwQ1Jo`TW1LlgI0P2#1 z3h3HBJC;xEUG|t~<-VrX;T7hW2^#8@kp%A1`|>#afLjChXAT>|8;Jn!%dg|gH?S)r zXarAY@Nh>ymBoG>ZU>tp;YRR9W{7RgWpgOIJSE>4t4Q9T8&j}xbf3%OG}3%Zfy%)_ zF@IA;h>wK~9;#vHiniOn=wL;U47N>+9>Fm^58SLNNh}${g0@P79lqIKFYjHBYOwf3zB4>JW4)plH1~q~+(Z>I$Jle6XcS-=odZPu$TcoGKJ2kAb6@L~D zpuN(fOEhjuiXk_$QZYexyv&I(pyVuhy z6RcNSaJ*G|njhlSWiuQlV~&199$(MPg3z>oW3OX#HA;Zy;L!pk(F~jYar?fpSH27* zGGnAyVw9TH+RgH!b9TH)AO@uDK9vvSV15ZS= zV)X0{@KNoG)u;b@$XeD^efv93N`w_3>X@AgXc^z&YquLZbL5%#=7{%-q>Y_-=}IS~ z%7o7Xl7ugyAzsUY)``<%nuQIC>AkFJ1q<<7kF!#k4$~ow4NFOgb1_VSAb%0hA!fiL zuEi(;QUo}MlB^>3wIBg1&R`A=`LLRSIM-tYC=v0DV%WHrpx2SqC2_BaSSLpXwCEWV z`pF-ZxE3U=)iQ@ODjD_>>3BjtaV`m1r$xatis9Q!J?33aTq*hMq{wg%CCPc5l&{a; zSM6V)$igJ~0HtPhf8l<~=@}4_e%jJgIyLc14i8W!V>*P5y9+8WpJ{ZHdJrlPY%Y?8 zxx(vYI?#=>>v0g1vk*Feb8q6jU+>~9$`HvGe3$xqOz$JjDZh^_Nns4ptcP~Vop!NH zMyDmQ))c}JsXAbn&~_Y7n|4C=N#F|+D1*Ah@lVN^s6|%i)rCBS2an~s#`LP8H$UW8 z5wC2qwPH1xMq#J#Y8vtO$<|sYO9yFG#rGDHTikr$W?}BsI<*c3^o7Ewt&?#P6Mq>L ziiOWH?d7&Ci3g@$M0W^RdLGuikN}0gJ&#G2cD&S`*ApUe$$05$cQ;;toOdN#kVX=q zRFCY`I_`)*+fZ^*E=Q1HJ+_}@VGq3}2@b(}6=;WCUZ)AO-(*3GL4aC4vai_ltTS0K z-yW`3LiSbL43nnBclJ44rU>iMIDgwtQEfc9rep)O8lacx)wCI$)1nx~eyxS)^K_kl z0LF7-a-C#5xT?8f`It3e-MjE~k#qOL^M$uH`=cASdY()z<;joPTxC` ze;ZPR_Zuwtk7sT7Uo>rcm6nP8sKYH16D?@|+GLICmeYCLy;nxSoIbML z3VC`0ErjmiOhHl>yKHaL&VSG9l=CxJ1}Lp+0ZgZPx?Mda@z%_&wq|;*HM`P+7<5iB zdX@dH@_`xKZfK*nCXM{8P9;BcWq{Hu6~J^_r`y#-a@k-Kd)gmdQ<@C(g=Jy8kzZlI z|H=ZZePRh%_tXN6{;;AYWog%UHxoJ(LzX8JuQNTDc+HiQ$oPy3m4E3;XOOm&hHdI? z&nxUgZ`|ho$^vVAI&ph@!*fkA`opHZ>MhSAbTf^;+$#&L@mVx`;2U7{ht0iL%hSwi zTT5MkWd%1rleTYcgfg9wRbA;y9is*dwS29n;b8FYt=*t&Zd~bcv=Y@8hEN`u`%UH(26M_ix&ABiz4fHnaN| z*wjZ&6PD_Ab2`C&zvB&Q9v~j++3PzU;uniypH7(HBk|6k1+g;}L61Y|m)E|RT+H=0 ZjMm9vej`7|@7eP6U+A9w^nce$doLJfhZz6> delta 4815 zcmV;=5-{!8DWNHlx_{H{N|NUso(tl60j0F1Huu==8Uy*id@m$Qq1yGY-DzdW?Z z{AYe;^XH4rPvSLN(EPQ@8so#K8@y01-2wBBuf;<+1A8 zVBo*^xHQi}&Pr%7Uu9g7t1@iOc3HtllNI}%kz_2=p&T5AJojG)@v4qCHX7f`%^)T{ z6(i#FaV=M(f*FLA+BkWURct# z<=S^SJs$inXCT14TsHJr^)=jZ!+NZt__OuC@W(HI$=(eRv~BC`acI!i!Tb#abBBR> zz`%r;nF)y3RUjBLbv;Pa(zXc(ZAM~&s z$gr9n`>xd#To7i%a)xq^V}3zpe^C=mJNzGcq-}B_eUYfvm)03z%F?dN zvxH7PkAGmmurFx_7519+)F)Fri5nA;xr3=87=AF-!2S$#01XX0^`+DfqCd;|SuMfT z9;rR8B?Qy68iJMwRsNX#!KWuB=q{yw@qOz5qyNmiqO}Y|w54QM)<@bJMFKIdsv>Ho)(gXG`duh9xAzoUBg1oi@8tcU`Yg(jzSB{TiLy*7lUOvK7 z84C63D7c7F-S?fu7E+QVq!{X8<>-OEKteskLB)9vim-8x2s07WranYyQyL&gyeeWF=TA9;sk$#ABu%sCAGHGk_sX1M-AnhDU=xMv&}fV*DwhdRvyA82y` zz`3Tu{E=)6BvzE$^Bl=yllh~8_D&#v)9XG+L7q++tKGaiPZKS6lRp~h<^&St!Gy8e zgY7uox$?t3Jsjx6879byGsbbsr})%IGdsAwte5-Xjsv*qAyB>FR7GCcVyOiKAFcs@Lbku z6mXs$(f2!_4_V8aYM9M#=_4uFbDk9pL(=rcvmHn#)REzh5KICG?#Evh28xG~cKacK zbi@MU@?0%I*-(~ROkaw4qw{;2-GBUHD#gBiIhAP@gF+-)Y zMGDY8eSJrpEQuH+9mVC%-Is@dl3%a4zmngtKmB%Nn?XkM#ws3KvV`VImw(epWkgTu zJp+Maj8`y!K;O&260?}-dtk9kO6-bQM-kISO?4dG!r!x$m2oXpC}LWulEt+!wtqj) zJdJY^`vNLkq$>9`V?!2;gdqulfE4&6+Y`?`J^Nt||#n zLk&t6wC511D!xOiMrKrvJ_3d0bJOH1zr{JNI^A}iR|i`3C9cjA_o}Uky$IB=?OzY? zKh9du;nU3fqIYRle(36PE+1#Rii-7JgO4{}LxOhx^WE+C^F7vg5q}f!`V2je=Q9el zaboi}jcBVZr`M&kW})->t= zbjPKS^BlCQ&=PvOlSmDbhz^>1<>(vXmHJ%_Rt44 zX<)8vaiPxZ8Q|{gX@B4mXeySp_d|CI90D0QLJSm&NI_6|&{4Rh35yT|2^Jp#nIMk; zG7H-M&)eItSmWOi@tzCVaqJcd&`xg;v|?DR1xUQtB6u9LMFzOr$WfJq7{Tj^qbivI z?ewsj=59EeA}jV-$5mv!^BRsg5zr9Dh{QHKW;w01_7xus84f#6G!&M2@ldGH5&25{ zle`BV7%3t!`?a`G|Md*;0O)Ce5xAo-*|9rDn3M4b6c{!MG9=V{1sra=!hm%9P}7F} zQkPwk_Dz!=2pR;ODsJABI|w8Kw1$&m2onT~1F&Y3cnBXBW9#b#y0j}pLS0tC;Z7?I zV7G6Iw(paz2pAH0;44GIBM^6}t1uwlF2Av&Z|%7cyqG{;9m#QPpI$44eWJN@^hzOEUK&@`*ac;_`7aU!508jfV^ ze@ZQX^8>(wj`uxl%Dlt(0%Tl#tQCehkydEX5qO~c{g6>pmq*^NVFO`+$H&7&6elbu zHgt6G`F}4uSbQ|1=f7xKfbI{|<`Fj7=wPAlYYE`)YgxeVH^|wjIUwBeCgN6?;f9OEaJbHa-B*uuu8M@h!7n0RhX3z4UwI`r*^qpMYQ#tmBm z+GI?JvZC2N6W2DT067AjLrF92VIj_aJ^@-pJeL@uN;U3+36bJarE(?XUu`)~+Z~AS zc7KroX&P7`iOqQil=$zY2$ZP+JA^);h6%)LYe9fI8Pg~%pR&hEmXcsh_v^Ij zL5<>$Jf&LKs5_I=*9kPi8YNce1Z3h&(q1P?!82K%3>Z zWi{(f8q*_Jijv^WLdG;XVoe=fDM7?C@{cVQO~jtRIm2$95CzXJhK=b{Bt7R(kp;PE z>lGT1?ONw8fH^H|y)3^vza7$YhA)pea+X#KnWGtH@T2?GVUZ)ZXPqcLs7sq>SbyZo zdkK-}KP;M#J(T8mtl1y|!hGj(SM@Oe+tl|K+5R7G{@ z6Nj6|74-l?KEq9}q)ci;YeqF+hi&aSK$`~Ep|L)%`4PXh(g1Y|s6$)J7ryx4{hB_R zMazi+T20sv$;$B0zOoh|(0lkd^nc1U(~ZzRoo*Xv!f$Vn72AKUho4gSU(6I4RzUl$PKA>AtRs1H zyg&5g_J+MLizimMFn@HgP;ayZNMCYBi}z*qT4(zM&=(CX))yTC+LxOn&3|&NCjuni z4-p*g#jpKMSS#KT@lFWXXd^yWSQ}1wcoWXZXbV1N#kU6Qf&hs(LIekTar08qqHn8` zx`dgbU_#wc(V=#9ZtV|_6a;B7Kf#d9r|S@AuJWX*?cx9ggo0_%(f zi#J9lK>ISJn&4}lIV#>22_I<6=e(lb{svb}lTZy9f6Q5R@m;EeNC&wE6gtwG}I|03EZRi<#G4}ZVlLf}!yWll7W;9!9c+q(8^ITuA+|M_&7th_lzd~XB6)vpOu@#{eJ+dBO!FxP zDhCI}e@qb}J{B@~sD_m*+HU)*gB3k8*fuSC1jqC|aI>Z)v1AAf+A0lp1fz5;aGM_J zV?L!jLoJeUBiJJ|z^xf;#jja=_*f)mU9+U!c8^V4_k(P7fpSwtFwlea|H0E)12m=v zJomTvFF)N+n)3q~SykQ-hToS%b%gu~R9J8be>DWaaGmPsZIx#@TQsOpS9JJ5Q^vfO z`E=x%QULSEP5`yaK^50Ep0O0uHjyE=Z!@mA#%;Ufk=EIF;PB3aCANVIUtAM;+Ga%m z1cKPsiBxFU;$(W6pz%&c&h+9O==pyQY6NGZ&Hqn%v}0fIlmLqLMhlL&NKb=zYS?8f ze{32+d!>cH+&|2rE9+F*_5`GJe3}2q!5kX$VK)PD?#BpFBH|gvuyHRzuOq2T;@%OlPL2v_ z(K9CWlRqeNZAe(FWe#UlGMppQ@q~Kf+!C-(i-Knq!;hDG%)6SnQu5bHk>MOllJh(% z-=Dp&+P^-Lg-P-OO3mp0!u^ud86hJ5w56wXYT}g~9-vIdbO;+y7gP$LX?Bx)5GoJs zE|P`0!s}!@(2cU|aT1fX5ITSJXySZc@8T@V5XlyNm->24?<37AzmF_QVGPl%hjz)G zPO(cyrzNr06v7axI$)R3cAQR|PD1rb;0qBbgSy1=Psy05MONq4g}j6ZkLA6_^r~Sr zKjc>tuWYfkVl|jXVW;nEn(_9@)>o!1j0 zaLIh>>2x>Vew(DqmPEqYVxRzuCv>Kq7 z=+*QYoXesZ#eTho=ks!%egejGVRD^hJGiR3VfmOfU_HC=bd&S+!t;%{HT$C*j(VP- zbiNhXOv|rxvJe)(MOYx8&r<9Hx`(p*7S%s90o*zZ71*`iDM)`rKG1)8>D;)Kto{Hj zFVOZ@nYH^AiuJNB=ySzwS>}zg(ipE=Ym2_GVtIbNYJ2{oY16B;OypM`9+8-6LG#xp zYfO)vuG^lyG6LrGk>ys%(;H|ZbU)4%BxSM7_9pH8s!ln-a%F(hsusX>ny1IrpCsOz znbp=zueD}ZS`dGO&Iv}ZvVT~TxcZY^H<-kp z&Lq~(^ZAakN6!u8#V!@{34$C3lOoGzJ3ko}c!t$cmJ` z%u!Ci^0$BeIQ;Xcx7#Lfggd^nysyXEMO%*G8xH=z|NTGxdz48xtY?oy(m!#OtNwrc z8=4DRq!;7Tn`G7-^v~0+oBr0#bn8dq>2WQsnPA(lFX9=^jbP^+9;Sax4?b)4|3y}B z#GRhswB<&4e$#Ab&o8j4kD4Yd)$8VTf%|^PJJLHmKs?g3;yeAsZx+KoT`-p;@yVYB pu`?7wk3;B}*M5*!%=I>m*2!W1Apea&v*p*n&^`O<{{SDSC|u)CYz_bb diff --git a/searchindex.js b/searchindex.js index 9fa459b39..030a6872e 100644 --- a/searchindex.js +++ b/searchindex.js @@ -1 +1 @@ -Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": 1, "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": 2, "origin": [2, 3, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "thi": [3, 5, 6, 7, 8, 9, 10], "It": [3, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "op": [3, 13], "config": [3, 5, 13], "analysi": [3, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "input": [3, 5, 7, 8, 9, 10], "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "oper": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "which": [3, 5, 7, 8, 9], "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 8, 9], "0": [3, 4, 5, 7, 8, 9], "sample_algo": 3, "str": [3, 4, 6, 7, 8, 9, 10], "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "The": [3, 4, 5, 8, 9, 10], "ratio": [3, 4, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "A": [3, 5, 7, 9], "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "static": 3, "to_jsonl": 3, "jsonl": [3, 4], "target": [3, 8, 10], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8, 9], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "directori": [3, 4, 8], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "pair": [3, 5, 7, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "other": [3, 8, 9], "two": [3, 7, 8], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "dataset_path": 4, "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "accord": [4, 5, 8, 9], "kei": [4, 5, 8, 9, 10], "field": [4, 5, 7, 8, 9, 10], "specifi": [4, 6, 8, 9, 10], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "tupl": [4, 8], "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 8, 9], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "t": [4, 6, 7], "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8], "datasset": 4, "dir": 4, "w1": 4, "d": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "max": [4, 7, 8, 9], "random_sampl": 4, "sample_numb": 4, "seed": 4, "bigger": [4, 9], "than": [4, 6, 7, 8, 9], "we": [4, 7, 8, 9, 13], "instead": [4, 6], "random": [4, 9], "42": 4, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "item": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "regard": 7, "onli": [7, 8, 9], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "support": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9], "addit": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativeint": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": 8, "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "nonnegativefloat": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file +Search.setIndex({"docnames": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "index", "modules"], "filenames": ["data_juicer.rst", "data_juicer.analysis.rst", "data_juicer.config.rst", "data_juicer.core.rst", "data_juicer.format.rst", "data_juicer.ops.rst", "data_juicer.ops.common.rst", "data_juicer.ops.deduplicator.rst", "data_juicer.ops.filter.rst", "data_juicer.ops.mapper.rst", "data_juicer.ops.selector.rst", "data_juicer.tools.rst", "data_juicer.utils.rst", "index.rst", "modules.rst"], "titles": ["data_juicer", "data_juicer.analysis", "data_juicer.config", "data_juicer.core", "data_juicer.format", "data_juicer.ops", "data_juicer.ops.common", "data_juicer.ops.deduplicator", "data_juicer.ops.filter", "data_juicer.ops.mapper", "data_juicer.ops.selector", "data_juicer.tools", "data_juicer.utils", "Welcome to data-juicer\u2019s documentation!", "data_juicer"], "terms": {"cuda_device_count": [0, 14], "sourc": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "is_cuda_avail": [0, 14], "class": [1, 3, 4, 5, 7, 8, 9, 10], "columnwiseanalysi": [1, 3, 13], "dataset": [1, 3, 4, 5, 7, 8, 9, 10], "output_path": 1, "overall_result": 1, "none": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "save_stats_in_one_fil": 1, "true": [1, 2, 3, 5, 6, 7, 8, 9, 10], "base": [1, 3, 4, 5, 7, 8, 9, 10], "object": [1, 2, 3, 8], "appli": [1, 3, 7, 9, 10], "each": [1, 3, 5, 7, 9], "column": [1, 3, 9], "stat": [1, 3, 5, 7, 8], "respect": [1, 9], "__init__": [1, 3, 4, 5, 7, 8, 9, 10], "initi": [1, 2, 3, 4, 7, 8, 9, 10], "method": [1, 3, 4, 6, 7, 8, 9, 10], "paramet": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "analyz": [1, 2, 3, 13], "path": [1, 2, 3, 4, 7], "store": [1, 3, 4, 5, 7, 8, 9], "result": [1, 3, 8], "option": [1, 3, 4], "precomput": 1, "overal": 1, "whether": [1, 2, 3, 4, 5, 6, 7, 8, 9], "save": [1, 2, 3], "all": [1, 3, 6, 8, 9], "figur": [1, 3, 9], "one": [1, 2, 6, 7, 8, 9], "imag": [1, 5, 7, 8, 9], "file": [1, 2, 3, 4, 5, 8, 9], "show_percentil": 1, "fals": [1, 2, 3, 4, 5, 6, 7, 8, 9], "show": [1, 3, 9], "skip_export": [1, 3], "draw": 1, "percentil": [1, 10], "line": [1, 2, 8, 9], "sub": [1, 6, 7], "If": [1, 3, 7, 8, 9], "": [1, 3, 7, 8, 9], "sever": [1, 9], "red": 1, "indic": [1, 9], "quantil": 1, "distribut": [1, 3, 9], "singl": [1, 3, 9], "window": [1, 7], "after": [1, 3, 6, 7, 8, 9], "disk": [1, 3], "return": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10], "draw_hist": 1, "ax": 1, "data": [1, 4, 5, 8, 9], "save_path": 1, "histogram": 1, "includ": [1, 7, 8, 9], "inform": [1, 5, 7, 8, 10], "draw_box": 1, "box": [1, 9], "plot": 1, "diversityanalysi": [1, 13], "lang_or_model": 1, "en": [1, 8, 9], "divers": [1, 9], "sampl": [1, 3, 4, 5, 7, 8, 9, 10], "get": [1, 6], "an": [1, 3, 4, 5, 7, 8, 9], "param": [1, 2, 4, 6, 7, 9], "model": [1, 6, 7, 8, 9, 13], "specif": [1, 3, 5, 7, 8, 9], "languag": [1, 7, 8, 9], "us": [1, 2, 3, 4, 5, 6, 7, 8, 9, 13], "load": [1, 3, 4, 5, 9], "comput": [1, 3, 5, 6, 7, 8], "column_nam": 1, "text": [1, 4, 5, 7, 8, 9], "lexic": 1, "tree": [1, 8], "name": [1, 3, 4, 5, 8, 9], "postproc_func": 1, "function": [1, 6, 7], "get_divers": 1, "postproc_kwarg": 1, "whole": [1, 8], "In": [1, 3], "default": [1, 2, 3, 4, 7, 8, 9], "argument": [1, 3, 5, 8, 9], "overallanalysi": [1, 3, 13], "mean": [1, 3, 9], "std": 1, "etc": [1, 3, 4], "refine_single_column": 1, "col": 1, "num_proc": [1, 3, 4], "1": [1, 3, 4, 8, 9], "describ": 1, "panda": 1, "number": [1, 3, 4, 5, 7, 8, 9, 10], "process": [1, 3, 4, 5, 6, 7, 8, 9, 10, 13], "export": [1, 3, 4, 5, 13], "init_config": [2, 13], "arg": [2, 3, 4, 5, 7, 8, 9, 10], "jsonargpars": 2, "parser": 2, "pars": [2, 9], "from": [2, 3, 4, 5, 6, 7, 8, 9, 10], "posix": 2, "style": 2, "command": [2, 4, 9], "yaml": 2, "json": [2, 3, 4, 8], "jsonnet": 2, "superset": 2, "environ": 2, "variabl": [2, 5], "hard": 2, "code": [2, 9], "list": [2, 3, 4, 5, 6, 8, 9], "e": [2, 3, 4, 8, 9], "g": [2, 3, 4, 9], "conifg": 2, "cfg": [2, 3, 4], "defaut": 2, "global": [2, 4, 9], "executor": [2, 3, 13], "export_config": [2, 13], "format": [2, 3, 8, 9, 13], "skip_non": 2, "skip_check": 2, "overwrit": [2, 9], "multifil": 2, "some": [2, 9], "ar": [2, 3, 6, 7, 8, 9, 10], "namespac": 2, "type": [2, 4, 9], "json_ind": 2, "parser_mod": 2, "exclud": 2, "entri": 2, "whose": [2, 8, 9], "valu": [2, 5, 7, 8, 9, 10], "i": [2, 3, 4, 5, 6, 7, 8, 9], "skip": 2, "check": 2, "exist": 2, "multipl": [2, 3, 4, 6, 7, 8], "__path__": 2, "meta": [2, 4], "merge_config": [2, 13], "ori_cfg": 2, "new_cfg": 2, "dict": [2, 3, 9], "merg": [2, 4, 6, 8], "configur": 2, "origin": [2, 3, 8, 9], "expect": [2, 9], "cfg_after_merg": 2, "thi": [3, 5, 6, 7, 8, 9, 10], "It": [3, 7, 8, 9], "filter": [3, 5, 7, 9, 13], "op": [3, 13], "config": [3, 5, 13], "analysi": [3, 13], "gener": [3, 9], "tabl": [3, 9], "help": 3, "user": 3, "understand": 3, "input": [3, 5, 7, 8, 9, 10], "better": [3, 8], "run": [3, 5, 8, 9], "load_data_np": 3, "pipelin": 3, "worker": 3, "when": [3, 4, 5, 7, 8, 9, 10], "nesteddataset": [3, 13], "karg": 3, "djdataset": 3, "enhanc": 3, "huggingfac": [3, 4, 8, 9], "usabl": 3, "effici": 3, "oper": 3, "checkpoint": 3, "tracer": [3, 5, 7, 13], "map": [3, 9], "overrid": 3, "func": 3, "which": [3, 5, 7, 8, 9], "call": 3, "most": [3, 9], "common": [3, 13], "can": [3, 8, 9], "access": 3, "nest": 3, "manner": 3, "select": [3, 4, 5, 8, 10], "classmethod": [3, 4], "from_dict": 3, "from_xx": 3, "constructor": 3, "construct": 3, "add_column": 3, "add": [3, 4], "select_column": 3, "remove_column": 3, "remov": [3, 5, 6, 8, 9], "cleanup_cache_fil": 3, "clear": 3, "raw": 3, "compress": 3, "cach": [3, 8], "unifi": [3, 4], "order": [3, 10], "sample_data": 3, "dataset_to_sampl": 3, "sample_ratio": 3, "float": [3, 8, 9], "0": [3, 4, 5, 7, 8, 9], "sample_algo": 3, "str": [3, 4, 6, 7, 8, 9, 10], "uniform": [3, 8, 9], "kwarg": [3, 4, 5, 7, 8, 9, 10], "subset": [3, 4], "given": [3, 8, 9], "formatt": [3, 4], "link": [3, 9], "The": [3, 4, 5, 8, 9, 10], "ratio": [3, 4, 6, 8, 9, 10], "size": [3, 6, 7, 8, 9], "algorithm": [3, 7, 9], "frequency_specified_field_selector": 3, "topk_specified_field_selector": 3, "A": [3, 5, 7, 9], "export_path": 3, "export_shard_s": 3, "export_in_parallel": 3, "export_d": 3, "keep_stats_in_res_d": 3, "keep_hashes_in_res_d": 3, "export_stat": 3, "kib": 3, "1024": 3, "mib": 3, "1048576": 3, "gib": 3, "1073741824": 3, "tib": 3, "1099511627776": 3, "shard": 3, "content": [3, 9], "keep": [3, 5, 7, 8, 9], "hash": [3, 5, 7], "export_compute_stat": 3, "statu": 3, "static": 3, "to_jsonl": 3, "jsonl": [3, 4], "target": [3, 8, 10], "extra": [3, 4, 7, 8, 9, 10], "to_json": 3, "to_parquet": 3, "parquet": [3, 4], "work_dir": 3, "show_num": [3, 5, 7], "10": [3, 8, 9], "trace": [3, 5, 7], "chang": [3, 9], "befor": [3, 8], "comparison": 3, "work": [3, 8, 9], "directori": [3, 4, 8], "maximum": [3, 8, 9], "trace_mapp": 3, "op_nam": 3, "previous_d": 3, "processed_d": 3, "text_kei": [3, 4, 5], "compar": 3, "mapper": [3, 5, 13], "mainli": 3, "differ": [3, 4, 6, 7, 8, 9], "pair": [3, 5, 7, 9], "due": 3, "modif": 3, "trace_batch_mapp": 3, "batchmapp": 3, "new": [3, 4, 9], "augment": [3, 6, 8, 9], "trace_filt": 3, "trace_dedupl": 3, "dup_pair": 3, "dedupl": [3, 5, 9, 13], "duplic": [3, 5, 7], "extract": [3, 8, 9], "other": [3, 8, 9], "two": [3, 7, 8], "embed": 3, "independ": [3, 8, 9], "obtain": [3, 6], "load_formatt": [4, 13], "dataset_path": 4, "suffix": [4, 8], "add_suffix": 4, "baseformatt": 4, "mixtur": 4, "weight": [4, 7, 9], "accord": [4, 5, 8, 9], "kei": [4, 5, 8, 9, 10], "field": [4, 5, 7, 8, 9, 10], "specifi": [4, 6, 8, 9, 10], "info": [4, 5], "jsonformatt": [4, 13], "localformatt": [4, 13], "zst": 4, "tupl": [4, 8], "local": 4, "packag": 4, "modul": [4, 13], "csv": 4, "load_dataset": 4, "int": [4, 8, 9], "global_cfg": 4, "its": [4, 5, 7, 9], "consequ": 4, "remoteformatt": [4, 13], "repositori": 4, "hub": 4, "textformatt": [4, 13], "txt": [4, 8], "pdf": [4, 8], "cpp": 4, "docx": [4, 8], "md": 4, "tex": [4, 9], "asm": 4, "bat": 4, "cmd": 4, "c": 4, "h": [4, 8, 9], "hpp": 4, "cc": 4, "hh": 4, "cmake": 4, "css": 4, "dockerfil": 4, "f90": 4, "f": 4, "f03": 4, "f08": 4, "f77": 4, "f95": 4, "fpp": 4, "go": 4, "html": [4, 9], "java": 4, "j": 4, "jl": 4, "lua": 4, "markdown": 4, "php": 4, "php3": 4, "php4": 4, "php5": 4, "phpt": 4, "pl": 4, "pm": 4, "pod": 4, "perl": 4, "ps1": 4, "psd1": 4, "psm1": 4, "py": 4, "rb": 4, "r": 4, "sql": 4, "scala": 4, "sh": 4, "bash": 4, "zsh": 4, "t": [4, 6, 7], "tsx": 4, "vb": 4, "makefil": 4, "xml": 4, "rst": 4, "m": [4, 9], "smali": 4, "datas": 4, "unified_format_dataset": 4, "parquetformatt": [4, 13], "csvformatt": [4, 13], "tsvformatt": [4, 13], "tsv": 4, "delimit": 4, "mixtureformatt": [4, 13], "max_sampl": 4, "mix": 4, "randomli": [4, 9], "everi": 4, "them": [4, 7, 8], "datasset": 4, "dir": 4, "w1": 4, "d": 4, "w2": 4, "ds_dir": 4, "w3": 4, "ds_file": 4, "max": [4, 7, 8, 9], "random_sampl": 4, "sample_numb": 4, "seed": 4, "bigger": [4, 9], "than": [4, 6, 7, 8, 9, 10], "we": [4, 7, 8, 9, 13], "instead": [4, 6], "random": [4, 9, 10], "42": 4, "load_op": [5, 13], "process_list": 5, "op_fus": 5, "item": 5, "fuse": 5, "share": 5, "same": 5, "intermedi": [5, 7, 8], "instanc": 5, "image_kei": 5, "audio_kei": 5, "audio": [5, 8, 9], "video_kei": [5, 9], "video": [5, 7, 8, 9], "compute_stat": [5, 7, 8], "context": [5, 7, 8, 9], "metric": [5, 7, 8], "decid": [5, 7, 8], "var": [5, 7, 8], "temporarili": [5, 7, 8], "For": [5, 7, 8, 9], "level": [5, 6, 7, 8, 9, 10], "boolean": [5, 7, 8], "conduct": 5, "edit": 5, "compute_hash": [5, 7], "doc": [5, 7], "open": [5, 7, 9], "selector": [5, 13], "get_sentences_from_docu": [6, 13], "document": [6, 7, 8, 9], "model_func": 6, "sentenc": [6, 9], "need": [6, 8, 9, 10], "split": [6, 9], "splite": 6, "separ": [6, 8, 10], "n": [6, 8, 9], "get_words_from_docu": [6, 13], "token_func": 6, "new_lin": 6, "tab": 6, "word": [6, 8, 9], "like": [6, 7, 8, 9], "stopword": [6, 8], "token": [6, 7, 8, 9], "merge_on_whitespace_tab_newlin": [6, 13], "invert": 6, "split_on_newline_tab_whitespac": [6, 13], "concaten": [6, 9], "first": [6, 7, 8, 9], "split_on_whitespac": [6, 13], "also": 6, "space": [6, 7], "tag": [6, 8, 9], "strip": [6, 13], "strip_charact": 6, "wai": [6, 9], "faster": 6, "sinc": 6, "now": [6, 9], "set": [6, 8, 9, 10], "contain": [6, 8, 9], "lot": 6, "element": 6, "emoji": 6, "charact": [6, 7, 8, 9], "words_augment": [6, 13], "group_siz": 6, "join_char": 6, "especi": [6, 8], "chines": [6, 7, 8, 9], "without": [6, 9], "between": [6, 7, 8, 9], "vietnames": [6, 8], "syllabl": 6, "group": [6, 8], "ad": [6, 9], "words_refin": [6, 13], "lower_cas": 6, "strip_char": 6, "use_words_aug": [6, 8], "words_aug_group_s": [6, 8], "2": [6, 8, 9], "words_aug_join_char": [6, 8], "refin": 6, "non": [6, 7, 9], "revers": [6, 10], "special": [6, 8, 9], "convert": [6, 7, 9], "lower": [6, 7, 8, 9, 10], "case": [6, 7, 8, 9, 13], "lowercas": [6, 7, 9], "char": [6, 8, 9], "videodedupl": [7, 13], "consider_text": 7, "bool": [7, 8, 9, 10], "exact": 7, "match": [7, 8, 9], "consid": [7, 8, 9], "togeth": [7, 9], "raybasicdedupl": [7, 13], "redis_host": 7, "localhost": 7, "redis_port": 7, "positiveint": [7, 8, 9, 10], "6380": 7, "basic": 7, "rai": 7, "although": 7, "implement": 7, "empty_hash_valu": 7, "empti": [7, 9], "hostnam": 7, "redi": 7, "server": 7, "port": 7, "calculate_hash": 7, "calcul": [7, 8], "documentminhashdedupl": [7, 13], "window_s": 7, "5": [7, 8, 9], "ignore_pattern": 7, "num_permut": 7, "256": 7, "jaccard_threshold": 7, "closedunitinterv": [7, 8, 9, 10], "7": [7, 9], "num_band": 7, "num_rows_per_band": 7, "tokenizer_model": 7, "minhashlsh": 7, "simhash": 7, "minhash": 7, "byte": [7, 8], "so": [7, 8, 9], "thei": 7, "won": 7, "kept": [7, 8], "final": [7, 9], "should": [7, 8, 9], "punctuat": [7, 9], "sentencepiec": 7, "english": [7, 8, 9], "recommend": [7, 9], "pleas": 7, "provid": [7, 9], "shingl": 7, "ignor": [7, 9], "string": [7, 8, 9], "pattern": [7, 9], "permut": 7, "min": [7, 8, 9], "jaccard": 7, "similar": [7, 8, 9], "threshold": [7, 8, 9], "detect": [7, 8, 9], "regard": 7, "onli": [7, 8, 9], "band": 7, "lsh": 7, "determin": [7, 10], "optim": 7, "minim": 7, "sum": 7, "prob": 7, "posit": [7, 8, 9], "neg": [7, 9], "row": 7, "rayimagededupl": [7, 13], "phash": 7, "raydocumentdedupl": [7, 13], "ignore_non_charact": 7, "alphabet": [7, 8, 9], "whitespac": [7, 9], "digit": 7, "documentdedupl": [7, 13], "md5": 7, "imagededupl": [7, 13], "documentsimhashdedupl": [7, 13], "6": [7, 8], "num_block": 7, "hamming_dist": 7, "4": [7, 8, 9], "And": 7, "block": 7, "ham": 7, "distanc": 7, "alwai": 7, "less": [7, 8, 9, 10], "rayvideodedupl": [7, 13], "imagetextsimilarityfilt": [8, 13], "hf_clip": 8, "openai": 8, "clip": [8, 9], "vit": 8, "patch32": 8, "min_scor": 8, "max_scor": 8, "horizontal_flip": [8, 9], "vertical_flip": [8, 9], "any_or_al": [8, 9], "ani": [8, 9], "reduce_mod": 8, "avg": 8, "those": 8, "within": [8, 9, 10], "rang": [8, 9, 10], "flip": [8, 9], "horizont": [8, 9], "left": [8, 9], "right": [8, 9], "vertic": [8, 9], "top": [8, 9, 10], "bottom": [8, 9], "strategi": [8, 9], "meet": [8, 9], "condit": [8, 9], "reduc": [8, 9], "mode": [8, 9], "correspond": [8, 10], "chunk": 8, "take": 8, "averag": 8, "rank": [8, 9, 10], "videoaspectratiofilt": [8, 13], "min_ratio": [8, 9], "9": [8, 9], "21": [8, 9], "max_ratio": [8, 9], "aspect": [8, 9], "aspectratio": [8, 9], "w": [8, 9], "minimum": [8, 9], "support": [8, 9], "imagetextmatchingfilt": [8, 13], "hf_blip": 8, "salesforc": [8, 9], "blip": [8, 9], "itm": 8, "coco": 8, "003": 8, "score": 8, "imagensfwfilt": [8, 13], "hf_nsfw_model": 8, "falconsai": 8, "nsfw_image_detect": 8, "score_threshold": 8, "have": 8, "low": 8, "nsfw": 8, "tokennumfilt": [8, 13], "hf_token": 8, "eleutherai": 8, "pythia": 8, "9b": 8, "dedup": 8, "min_num": 8, "max_num": 8, "9223372036854775807": [8, 9], "total": [8, 9], "hug": 8, "face": [8, 9], "below": [8, 9], "exce": [8, 9], "textlengthfilt": [8, 13], "min_len": [8, 9], "max_len": [8, 9], "length": [8, 9], "specifiednumericfieldfilt": [8, 13], "field_kei": [8, 10], "min_valu": 8, "max_valu": 8, "numer": 8, "multi": [8, 10, 13], "specifiednumericfield": 8, "audionmfsnrfilt": [8, 13], "min_snr": 8, "max_snr": 8, "nmf_iter_num": 8, "500": [8, 9], "snr": 8, "nmf": 8, "db": 8, "sy": 8, "maxsiz": 8, "iter": [8, 9], "videoaestheticsfilt": [8, 13], "hf_scorer_model": 8, "frame_sampling_method": [8, 9], "frame_num": [8, 9], "3": [8, 9], "aesthet": 8, "frame": [8, 9], "predictor": 8, "By": 8, "shunk031": 8, "v2": 8, "sac": 8, "logo": 8, "ava1": 8, "l14": 8, "linearms": 8, "refer": [8, 9], "pypi": 8, "org": [8, 9], "project": 8, "simpl": [8, 9], "predict": 8, "all_keyfram": [8, 9], "former": [8, 9], "latter": [8, 9], "uniformli": [8, 9], "keyfram": 8, "larg": 8, "while": 8, "usual": 8, "small": 8, "term": 8, "middl": [8, 9], "last": [8, 9], "larger": [8, 9, 10], "addit": [8, 9], "durat": [8, 9], "must": [8, 9], "keyword": [8, 9], "perplexityfilt": [8, 13], "lang": [8, 9], "max_ppl": 8, "positivefloat": 8, "1500": 8, "perplex": 8, "phrasegroundingrecallfilt": [8, 13], "hf_owlvit": 8, "googl": 8, "owlvit": 8, "min_recal": 8, "max_recal": 8, "iou_thr": 8, "large_area_ratio_thr": 8, "95": 8, "conf_thr": 8, "locat": [8, 9], "recal": 8, "phrase": 8, "owl": 8, "ground": 8, "iou": 8, "nm": 8, "post": 8, "bbox": 8, "overlap": 8, "confid": 8, "area": 8, "out": 8, "account": 8, "more": [8, 9, 13], "maximumlinelengthfilt": [8, 13], "averagelinelengthfilt": [8, 13], "specifiedfieldfilt": [8, 13], "target_valu": 8, "retain": [8, 9], "videotaggingfromframesfilt": [8, 13], "peopl": 8, "shift": 8, "found": [8, 9], "http": [8, 9], "github": 8, "com": 8, "xinyu1205": 8, "recogn": 8, "anyth": 8, "blob": 8, "main": [8, 9], "ram": 8, "ram_tag_list": 8, "noqa": 8, "e501": 8, "requir": 8, "equal": [8, 9, 10], "depend": [8, 9], "textentitydependencyfilt": [8, 13], "min_dependency_num": 8, "identifi": [8, 9], "entiti": 8, "omit": 8, "zh": 8, "mini_dependency_num": 8, "edg": 8, "objet": 8, "videoresolutionfilt": [8, 13], "min_width": [8, 9], "max_width": [8, 9], "min_height": [8, 9], "max_height": [8, 9], "resolut": [8, 9], "alphanumericfilt": [8, 13], "25": 8, "count": 8, "alphanumer": 8, "imagewatermarkfilt": [8, 13], "hf_watermark_model": 8, "amrul": 8, "hzz": 8, "watermark_detector": 8, "prob_threshold": 8, "8": [8, 9], "watermark": [8, 9], "high": 8, "probabl": [8, 9], "imageaestheticsfilt": [8, 13], "audiosizefilt": [8, 13], "min_siz": 8, "max_siz": 8, "1tb": 8, "kb": 8, "mb": 8, "constraint": 8, "approxim": 8, "un": 8, "limit": 8, "stopwordsfilt": [8, 13], "stopwords_dir": 8, "home": 8, "runner": 8, "asset": 8, "what": 8, "adopt": 8, "avail": 8, "join": 8, "characterrepetitionfilt": [8, 13], "rep_len": 8, "gram": 8, "repetit": 8, "imageshapefilt": [8, 13], "shape": 8, "width": [8, 9], "height": [8, 9], "videodurationfilt": [8, 13], "min_dur": 8, "nonnegativefloat": [8, 9], "max_dur": 8, "second": [8, 9], "textactionfilt": [8, 13], "min_action_num": 8, "action": 8, "mini_action_num": 8, "videoocrarearatiofilt": [8, 13], "min_area_ratio": 8, "max_area_ratio": 8, "frame_sample_num": 8, "languages_to_detect": 8, "ch_sim": 8, "ocr": [8, 9], "evenli": 8, "full": [8, 9], "here": [8, 9, 13], "www": 8, "jaid": 8, "ai": [8, 9], "easyocr": 8, "get_read": 8, "videonsfwfilt": [8, 13], "specialcharactersfilt": [8, 13], "videoframestextsimilarityfilt": [8, 13], "kind": [8, 9], "relat": 8, "exampl": 8, "chineseclip": 8, "might": [8, 9], "choic": 8, "imageaspectratiofilt": [8, 13], "333": 8, "audiodurationfilt": [8, 13], "nonnegativeint": [8, 9], "languageidscorefilt": [8, 13], "identif": 8, "suffixfilt": [8, 13], "imagesizefilt": [8, 13], "videowatermarkfilt": [8, 13], "wordsnumfilt": [8, 13], "imagefaceratiofilt": [8, 13], "largest": [8, 10], "flaggedwordfilt": [8, 13], "045": 8, "flagged_words_dir": 8, "flag": 8, "flagged_word": 8, "wordrepetitionfilt": [8, 13], "videomotionscorefilt": [8, 13], "7976931348623157e": 8, "308": 8, "sampling_fp": 8, "sequenc": [8, 9], "rel": 8, "motion": 8, "farneback": 8, "algorith": 8, "opencv": 8, "dens": 8, "optic": 8, "flow": 8, "rate": 8, "frames_per_second": 8, "resiz": [8, 9], "smaller": [8, 9, 10], "rescal": 8, "allow": [8, 9], "longer": 8, "greater": [8, 9, 10], "being": [8, 9], "overrul": 8, "As": 8, "mai": 8, "shorter": [8, 9], "magnitud": 8, "normal": [8, 9], "diagon": 8, "videocaptioningfromaudiomapp": [9, 13], "keep_original_sampl": 9, "caption": 9, "stream": 9, "qwen": 9, "videotaggingfromaudiomapp": [9, 13], "hf_ast": 9, "mit": 9, "ast": 9, "finetun": 9, "audioset": 9, "4593": 9, "spectrogram": 9, "transform": 9, "imagecaptioningfromgpt4vmapp": [9, 13], "descript": 9, "api_kei": 9, "max_token": 9, "temperatur": 9, "system_prompt": 9, "user_prompt": 9, "user_prompt_kei": 9, "gpt": 9, "visison": 9, "reson": 9, "convers": 9, "custom": 9, "api": 9, "authent": 9, "request": 9, "control": 9, "output": 9, "prompt": 9, "guidanc": [9, 13], "rule": [9, 10], "gpt4": 9, "vision": 9, "respons": 9, "guid": 9, "uers_prompt_kei": 9, "punctuationnormalizationmapp": [9, 13], "unicod": 9, "removebibliographymapp": [9, 13], "bibliographi": 9, "end": 9, "latex": 9, "sentencesplitmapp": [9, 13], "videosplitbyscenemapp": [9, 13], "detector": 9, "contentdetector": 9, "27": 9, "min_scene_len": 9, "15": 9, "show_progress": 9, "cut": 9, "scene": 9, "avaliable_detector": 9, "adaptivedetector": 9, "window_width": 9, "min_content_v": 9, "luma_onli": 9, "kernel_s": 9, "video_manag": 9, "min_delta_hsv": 9, "thresholddetector": 9, "fade_bia": 9, "add_final_scen": 9, "block_siz": 9, "scenedetect": 9, "pass": 9, "progress": 9, "cleanipmapp": [9, 13], "repl": 9, "clean": 9, "ipv4": 9, "ipv6": 9, "address": 9, "regular": 9, "express": 9, "search": [9, 13], "replac": 9, "cleanlinksmapp": [9, 13], "ftp": 9, "removeheadermapp": [9, 13], "drop_no_head": 9, "header": 9, "begin": 9, "drop": 9, "removetabletextmapp": [9, 13], "min_col": 9, "from_2_to_20": 9, "max_col": 9, "20": 9, "videoremovewatermarkmapp": [9, 13], "roi_str": 9, "roi_typ": 9, "roi_kei": 9, "min_frame_threshold": 9, "detection_method": 9, "pixel_valu": 9, "region": 9, "x1": 9, "y1": 9, "x2": 9, "y2": 9, "roi": 9, "pixel": 9, "corner": 9, "coordin": 9, "wight": 9, "coodin": 9, "pixel_divers": 9, "useless": 9, "removerepeatsentencesmapp": [9, 13], "ignore_special_charact": 9, "min_repeat_sentence_length": 9, "repeat": 9, "judg": 9, "except": 9, "letter": 9, "imagediffusionmapp": [9, 13], "hf_diffus": 9, "compvi": 9, "stabl": 9, "diffus": 9, "v1": 9, "torch_dtyp": 9, "fp32": 9, "revis": 9, "strength": 9, "guidance_scal": 9, "aug_num": 9, "caption_kei": 9, "hf_img2seq": 9, "blip2": 9, "opt": 9, "7b": 9, "point": 9, "fp16": 9, "bf16": 9, "version": 9, "branch": 9, "commit": 9, "id": 9, "git": 9, "extent": 9, "start": 9, "nois": 9, "higher": 9, "denois": 9, "step": 9, "amount": 9, "num_inference_step": 9, "essenti": 9, "scale": 9, "encourag": 9, "close": 9, "expens": 9, "qualiti": 9, "enabl": 9, "produc": 9, "keep_candidate_mod": 9, "caption_num": 9, "candid": 9, "random_ani": 9, "similar_one_simhash": 9, "batched_op": 9, "both": [9, 10], "suppos": 9, "batch": 9, "b": 9, "denot": 9, "2nb": 9, "nb": 9, "mnb": 9, "otherwis": 9, "imagefaceblurmapp": [9, 13], "blur_typ": 9, "gaussian": 9, "radiu": 9, "blur": 9, "kernel": 9, "videoffmpegwrappedmapp": [9, 13], "filter_nam": 9, "filter_kwarg": 9, "global_arg": 9, "capture_stderr": 9, "overwrite_output": 9, "wrapper": 9, "ffmpeg": 9, "captur": 9, "stderr": 9, "chineseconvertmapp": [9, 13], "s2t": 9, "tradit": 9, "simplifi": 9, "japanes": 9, "kanji": 9, "choos": 9, "t2": 9, "s2tw": 9, "taiwan": 9, "standard": 9, "tw2": 9, "s2hk": 9, "hong": 9, "kong": 9, "variant": 9, "hk2": 9, "s2twp": 9, "taiwanes": 9, "idiom": 9, "tw2sp": 9, "mainland": 9, "t2tw": 9, "tw2t": 9, "hk2t": 9, "t2hk": 9, "t2jp": 9, "ky\u016bjitai": 9, "jp2t": 9, "shinjitai": 9, "nlpcdazhmapp": [9, 13], "sequenti": 9, "replace_similar_word": 9, "replace_homophone_char": 9, "delete_random_char": 9, "swap_random_char": 9, "replace_equivalent_num": 9, "simpli": 9, "nlpcda": 9, "librari": 9, "you": 9, "time": 9, "semant": 9, "significantli": 9, "notic": 9, "combin": 9, "would": 9, "opened_aug_method": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "\u8fd9\u8fb9\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "homophon": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6fd6\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "delet": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a": 9, "swap": 9, "contigu": 9, "\u8fd9\u91cc\u4e00\u5171\u67095\u79cd\u4e0d\u540c\u7684\u6570\u636e\u5f3a\u589e\u65b9\u6cd5": 9, "equival": 9, "represent": 9, "\u8fd9\u91cc\u4e00\u5171\u6709\u4f0d\u79cd\u4e0d\u540c\u7684\u6570\u636e\u589e\u5f3a\u65b9\u6cd5": 9, "imageblurmapp": [9, 13], "p": 9, "blure": 9, "cleancopyrightmapp": [9, 13], "copyright": 9, "comment": 9, "removenonchinesecharacterlmapp": [9, 13], "keep_alphabet": 9, "keep_numb": 9, "keep_punc": 9, "videosplitbykeyframemapp": [9, 13], "get_split_key_fram": 9, "removespecificcharsmapp": [9, 13], "chars_to_remov": 9, "videoresizeaspectratiomapp": [9, 13], "increas": 9, "decreas": 9, "enforc": 9, "abov": 9, "adjust": 9, "dimens": 9, "either": 9, "enlarg": 9, "accept": 9, "cleanhtmlmapp": [9, 13], "whitespacenormalizationmapp": [9, 13], "0x20": 9, "wikipedia": 9, "wiki": 9, "whitespace_charact": 9, "videotaggingfromframesmapp": [9, 13], "removecommentsmapp": [9, 13], "doc_typ": 9, "inlin": 9, "multilin": 9, "expandmacromapp": [9, 13], "expand": 9, "macro": 9, "definit": 9, "bodi": 9, "extractqamapp": [9, 13], "hf_model": 9, "alibaba": 9, "pai": 9, "qwen1_5": 9, "doc2qa": 9, "qa_format": 9, "chatml": 9, "question": 9, "answer": 9, "llama3": 9, "8b": 9, "baichuan2": 9, "4b": 9, "1b8": 9, "0b5": 9, "These": 9, "train": 9, "suitabl": 9, "hugginfac": 9, "interfac": 9, "follow": 9, "\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u4e4c\u5170\u5df4\u6258": 9, "ulaanbaatar": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u96f7\u514b\u96c5\u672a\u514b": 9, "reykjavik": 9, "human": 9, "\u8bf7\u95ee\u8499\u53e4\u56fd\u7684\u9996\u90fd\u662f\u54ea\u91cc": 9, "assist": 9, "\u4f60\u597d": 9, "\u6839\u636e\u63d0\u4f9b\u7684\u4fe1\u606f": 9, "\u51b0\u5c9b\u7684\u9996\u90fd\u662f\u54ea\u91cc\u5462": 9, "imagecaptioningmapp": [9, 13], "prompt_kei": 9, "anoth": 9, "how": 9, "mani": 9, "similar_on": 9, "removewordswithincorrectsubstringsmapp": [9, 13], "substr": 9, "incorrect": 9, "should_keep_word_with_incorrect_substr": 9, "videocaptioningfromvideomapp": [9, 13], "hf_video_blip": 9, "kpyu": 9, "ego4d": 9, "videocaptioningfromsummarizermapp": [9, 13], "hf_summar": 9, "consider_video_caption_from_video": 9, "consider_video_caption_from_audio": 9, "consider_video_caption_from_fram": 9, "consider_video_tags_from_audio": 9, "consider_video_tags_from_fram": 9, "vid_cap_from_vid_arg": 9, "vid_cap_from_frm_arg": 9, "vid_tag_from_aud_arg": 9, "vid_tag_from_frm_arg": 9, "keep_tag_num": 9, "summar": 9, "directli": 9, "too": 9, "bring": 9, "influenc": 9, "frequent": 9, "fixunicodemapp": [9, 13], "fix": 9, "error": 9, "form": 9, "nfc": 9, "nfkc": 9, "nfd": 9, "nfkd": 9, "nlpaugenmapp": [9, 13], "delete_random_word": 9, "swap_random_word": 9, "spelling_error_word": 9, "split_random_word": 9, "keyboard_error_char": 9, "ocr_error_char": 9, "insert_random_char": 9, "nlpaug": 9, "love": 9, "llm": 9, "simul": 9, "spell": 9, "ll": 9, "keyboard": 9, "ov4": 9, "10ve": 9, "oe": 9, "ovl": 9, "insert": 9, "lkove": 9, "videocaptioningfromframesmapp": [9, 13], "removelongwordsmapp": [9, 13], "long": 9, "should_keep_long_word": 9, "videoresizeresolutionmapp": [9, 13], "force_original_aspect_ratio": 9, "disabl": 9, "force_divisible_bi": 9, "leav": 9, "super": 9, "deep": 9, "learn": 9, "futur": 9, "necessari": 9, "ensur": 9, "divis": 9, "integ": 9, "even": 9, "cleanemailmapp": [9, 13], "email": 9, "replacecontentmapp": [9, 13], "design": 9, "audioffmpegwrappedmapp": [9, 13], "videosplitbydurationmapp": [9, 13], "split_dur": 9, "min_last_split_dur": 9, "discard": 9, "split_videos_by_dur": 9, "videofaceblurmapp": [9, 13], "frequencyspecifiedfieldselector": [10, 13], "top_ratio": 10, "topk": 10, "sort": 10, "frequenc": 10, "descend": 10, "randomselector": [10, 13], "select_ratio": 10, "select_num": 10, "rangespecifiedfieldselector": [10, 13], "lower_percentil": 10, "upper_percentil": 10, "lower_rank": 10, "upper_rank": 10, "smallest": 10, "bound": 10, "upper": 10, "topkspecifiedfieldselector": [10, 13], "give": 13, "kdd": 13, "24": 13, "modal": 13, "foundat": 13, "practic": 13, "see": 13, "detail": 13, "data_juic": 13, "core": 13, "index": 13, "page": 13}, "objects": {"": [[0, 0, 0, "-", "data_juicer"]], "data_juicer": [[1, 0, 0, "-", "analysis"], [2, 0, 0, "-", "config"], [3, 0, 0, "-", "core"], [0, 3, 1, "", "cuda_device_count"], [4, 0, 0, "-", "format"], [0, 3, 1, "", "is_cuda_available"], [5, 0, 0, "-", "ops"], [11, 0, 0, "-", "tools"], [12, 0, 0, "-", "utils"]], "data_juicer.analysis": [[1, 1, 1, "", "ColumnWiseAnalysis"], [1, 1, 1, "", "DiversityAnalysis"], [1, 1, 1, "", "OverallAnalysis"]], "data_juicer.analysis.ColumnWiseAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "draw_box"], [1, 2, 1, "", "draw_hist"]], "data_juicer.analysis.DiversityAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "compute"]], "data_juicer.analysis.OverallAnalysis": [[1, 2, 1, "", "__init__"], [1, 2, 1, "", "analyze"], [1, 2, 1, "", "refine_single_column"]], "data_juicer.config": [[2, 3, 1, "", "export_config"], [2, 3, 1, "", "init_configs"], [2, 3, 1, "", "merge_config"]], "data_juicer.core": [[3, 1, 1, "", "Analyzer"], [3, 1, 1, "", "Executor"], [3, 1, 1, "", "Exporter"], [3, 1, 1, "", "NestedDataset"], [3, 1, 1, "", "Tracer"]], "data_juicer.core.Analyzer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"]], "data_juicer.core.Executor": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "run"], [3, 2, 1, "", "sample_data"]], "data_juicer.core.Exporter": [[3, 4, 1, "", "GiB"], [3, 4, 1, "", "KiB"], [3, 4, 1, "", "MiB"], [3, 4, 1, "", "TiB"], [3, 2, 1, "", "__init__"], [3, 2, 1, "", "export"], [3, 2, 1, "", "export_compute_stats"], [3, 2, 1, "", "to_json"], [3, 2, 1, "", "to_jsonl"], [3, 2, 1, "", "to_parquet"]], "data_juicer.core.NestedDataset": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "add_column"], [3, 2, 1, "", "cleanup_cache_files"], [3, 2, 1, "", "filter"], [3, 2, 1, "", "from_dict"], [3, 2, 1, "", "map"], [3, 2, 1, "", "process"], [3, 2, 1, "", "remove_columns"], [3, 2, 1, "", "select"], [3, 2, 1, "", "select_columns"]], "data_juicer.core.Tracer": [[3, 2, 1, "", "__init__"], [3, 2, 1, "", "trace_batch_mapper"], [3, 2, 1, "", "trace_deduplicator"], [3, 2, 1, "", "trace_filter"], [3, 2, 1, "", "trace_mapper"]], "data_juicer.format": [[4, 1, 1, "", "CsvFormatter"], [4, 1, 1, "", "JsonFormatter"], [4, 1, 1, "", "LocalFormatter"], [4, 1, 1, "", "MixtureFormatter"], [4, 1, 1, "", "ParquetFormatter"], [4, 1, 1, "", "RemoteFormatter"], [4, 1, 1, "", "TextFormatter"], [4, 1, 1, "", "TsvFormatter"], [4, 3, 1, "", "load_formatter"]], "data_juicer.format.CsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.JsonFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.LocalFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.MixtureFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"], [4, 2, 1, "", "random_sample"]], "data_juicer.format.ParquetFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.format.RemoteFormatter": [[4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TextFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"], [4, 2, 1, "", "load_dataset"]], "data_juicer.format.TsvFormatter": [[4, 4, 1, "", "SUFFIXES"], [4, 2, 1, "", "__init__"]], "data_juicer.ops": [[5, 1, 1, "", "Deduplicator"], [5, 1, 1, "", "Filter"], [5, 1, 1, "", "Mapper"], [5, 1, 1, "", "Selector"], [6, 0, 0, "-", "common"], [7, 0, 0, "-", "deduplicator"], [8, 0, 0, "-", "filter"], [5, 3, 1, "", "load_ops"], [9, 0, 0, "-", "mapper"], [10, 0, 0, "-", "selector"]], "data_juicer.ops.Deduplicator": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_hash"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Filter": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "compute_stats"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Mapper": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.Selector": [[5, 2, 1, "", "__init__"], [5, 2, 1, "", "process"], [5, 2, 1, "", "run"]], "data_juicer.ops.common": [[6, 3, 1, "", "get_sentences_from_document"], [6, 3, 1, "", "get_words_from_document"], [6, 3, 1, "", "merge_on_whitespace_tab_newline"], [6, 3, 1, "", "split_on_newline_tab_whitespace"], [6, 3, 1, "", "split_on_whitespace"], [6, 3, 1, "", "strip"], [6, 3, 1, "", "words_augmentation"], [6, 3, 1, "", "words_refinement"]], "data_juicer.ops.deduplicator": [[7, 1, 1, "", "DocumentDeduplicator"], [7, 1, 1, "", "DocumentMinhashDeduplicator"], [7, 1, 1, "", "DocumentSimhashDeduplicator"], [7, 1, 1, "", "ImageDeduplicator"], [7, 1, 1, "", "RayBasicDeduplicator"], [7, 1, 1, "", "RayDocumentDeduplicator"], [7, 1, 1, "", "RayImageDeduplicator"], [7, 1, 1, "", "RayVideoDeduplicator"], [7, 1, 1, "", "VideoDeduplicator"]], "data_juicer.ops.deduplicator.DocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.ImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayBasicDeduplicator": [[7, 4, 1, "", "EMPTY_HASH_VALUE"], [7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"], [7, 2, 1, "", "compute_stats"], [7, 2, 1, "", "process"]], "data_juicer.ops.deduplicator.RayDocumentDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayImageDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.RayVideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "calculate_hash"]], "data_juicer.ops.deduplicator.VideoDeduplicator": [[7, 2, 1, "", "__init__"], [7, 2, 1, "", "compute_hash"], [7, 2, 1, "", "process"]], "data_juicer.ops.filter": [[8, 1, 1, "", "AlphanumericFilter"], [8, 1, 1, "", "AudioDurationFilter"], [8, 1, 1, "", "AudioNMFSNRFilter"], [8, 1, 1, "", "AudioSizeFilter"], [8, 1, 1, "", "AverageLineLengthFilter"], [8, 1, 1, "", "CharacterRepetitionFilter"], [8, 1, 1, "", "FlaggedWordFilter"], [8, 1, 1, "", "ImageAestheticsFilter"], [8, 1, 1, "", "ImageAspectRatioFilter"], [8, 1, 1, "", "ImageFaceRatioFilter"], [8, 1, 1, "", "ImageNSFWFilter"], [8, 1, 1, "", "ImageShapeFilter"], [8, 1, 1, "", "ImageSizeFilter"], [8, 1, 1, "", "ImageTextMatchingFilter"], [8, 1, 1, "", "ImageTextSimilarityFilter"], [8, 1, 1, "", "ImageWatermarkFilter"], [8, 1, 1, "", "LanguageIDScoreFilter"], [8, 1, 1, "", "MaximumLineLengthFilter"], [8, 1, 1, "", "PerplexityFilter"], [8, 1, 1, "", "PhraseGroundingRecallFilter"], [8, 1, 1, "", "SpecialCharactersFilter"], [8, 1, 1, "", "SpecifiedFieldFilter"], [8, 1, 1, "", "SpecifiedNumericFieldFilter"], [8, 1, 1, "", "StopWordsFilter"], [8, 1, 1, "", "SuffixFilter"], [8, 1, 1, "", "TextActionFilter"], [8, 1, 1, "", "TextEntityDependencyFilter"], [8, 1, 1, "", "TextLengthFilter"], [8, 1, 1, "", "TokenNumFilter"], [8, 1, 1, "", "VideoAestheticsFilter"], [8, 1, 1, "", "VideoAspectRatioFilter"], [8, 1, 1, "", "VideoDurationFilter"], [8, 1, 1, "", "VideoFramesTextSimilarityFilter"], [8, 1, 1, "", "VideoMotionScoreFilter"], [8, 1, 1, "", "VideoNSFWFilter"], [8, 1, 1, "", "VideoOcrAreaRatioFilter"], [8, 1, 1, "", "VideoResolutionFilter"], [8, 1, 1, "", "VideoTaggingFromFramesFilter"], [8, 1, 1, "", "VideoWatermarkFilter"], [8, 1, 1, "", "WordRepetitionFilter"], [8, 1, 1, "", "WordsNumFilter"]], "data_juicer.ops.filter.AlphanumericFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioNMFSNRFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AudioSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.AverageLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.CharacterRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.FlaggedWordFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageFaceRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageShapeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageSizeFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextMatchingFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.ImageWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.LanguageIDScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.MaximumLineLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PerplexityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.PhraseGroundingRecallFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecialCharactersFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SpecifiedNumericFieldFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.StopWordsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.SuffixFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextActionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextEntityDependencyFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TextLengthFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.TokenNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAestheticsFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoAspectRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoDurationFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoFramesTextSimilarityFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoMotionScoreFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoNSFWFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoOcrAreaRatioFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "get_reader"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoResolutionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoTaggingFromFramesFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.VideoWatermarkFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordRepetitionFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.filter.WordsNumFilter": [[8, 2, 1, "", "__init__"], [8, 2, 1, "", "compute_stats"], [8, 2, 1, "", "process"]], "data_juicer.ops.mapper": [[9, 1, 1, "", "AudioFFmpegWrappedMapper"], [9, 1, 1, "", "ChineseConvertMapper"], [9, 1, 1, "", "CleanCopyrightMapper"], [9, 1, 1, "", "CleanEmailMapper"], [9, 1, 1, "", "CleanHtmlMapper"], [9, 1, 1, "", "CleanIpMapper"], [9, 1, 1, "", "CleanLinksMapper"], [9, 1, 1, "", "ExpandMacroMapper"], [9, 1, 1, "", "ExtractQAMapper"], [9, 1, 1, "", "FixUnicodeMapper"], [9, 1, 1, "", "ImageBlurMapper"], [9, 1, 1, "", "ImageCaptioningFromGPT4VMapper"], [9, 1, 1, "", "ImageCaptioningMapper"], [9, 1, 1, "", "ImageDiffusionMapper"], [9, 1, 1, "", "ImageFaceBlurMapper"], [9, 1, 1, "", "NlpaugEnMapper"], [9, 1, 1, "", "NlpcdaZhMapper"], [9, 1, 1, "", "PunctuationNormalizationMapper"], [9, 1, 1, "", "RemoveBibliographyMapper"], [9, 1, 1, "", "RemoveCommentsMapper"], [9, 1, 1, "", "RemoveHeaderMapper"], [9, 1, 1, "", "RemoveLongWordsMapper"], [9, 1, 1, "", "RemoveNonChineseCharacterlMapper"], [9, 1, 1, "", "RemoveRepeatSentencesMapper"], [9, 1, 1, "", "RemoveSpecificCharsMapper"], [9, 1, 1, "", "RemoveTableTextMapper"], [9, 1, 1, "", "RemoveWordsWithIncorrectSubstringsMapper"], [9, 1, 1, "", "ReplaceContentMapper"], [9, 1, 1, "", "SentenceSplitMapper"], [9, 1, 1, "", "VideoCaptioningFromAudioMapper"], [9, 1, 1, "", "VideoCaptioningFromFramesMapper"], [9, 1, 1, "", "VideoCaptioningFromSummarizerMapper"], [9, 1, 1, "", "VideoCaptioningFromVideoMapper"], [9, 1, 1, "", "VideoFFmpegWrappedMapper"], [9, 1, 1, "", "VideoFaceBlurMapper"], [9, 1, 1, "", "VideoRemoveWatermarkMapper"], [9, 1, 1, "", "VideoResizeAspectRatioMapper"], [9, 1, 1, "", "VideoResizeResolutionMapper"], [9, 1, 1, "", "VideoSplitByDurationMapper"], [9, 1, 1, "", "VideoSplitByKeyFrameMapper"], [9, 1, 1, "", "VideoSplitBySceneMapper"], [9, 1, 1, "", "VideoTaggingFromAudioMapper"], [9, 1, 1, "", "VideoTaggingFromFramesMapper"], [9, 1, 1, "", "WhitespaceNormalizationMapper"]], "data_juicer.ops.mapper.AudioFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ChineseConvertMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanCopyrightMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanEmailMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanHtmlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanIpMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.CleanLinksMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExpandMacroMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ExtractQAMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.FixUnicodeMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageCaptioningMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageDiffusionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.ImageFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpaugEnMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.NlpcdaZhMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.PunctuationNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveBibliographyMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveCommentsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveHeaderMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveLongWordsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_long_word"]], "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveRepeatSentencesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveSpecificCharsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveTableTextMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "should_keep_word_with_incorrect_substrings"]], "data_juicer.ops.mapper.ReplaceContentMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.SentenceSplitMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFFmpegWrappedMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoFaceBlurMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoRemoveWatermarkMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeAspectRatioMapper": [[9, 4, 1, "", "STRATEGY"], [9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoResizeResolutionMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitByDurationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"], [9, 2, 1, "", "split_videos_by_duration"]], "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "get_split_key_frame"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoSplitBySceneMapper": [[9, 2, 1, "", "__init__"], [9, 4, 1, "", "avaliable_detectors"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromAudioMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.VideoTaggingFromFramesMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.mapper.WhitespaceNormalizationMapper": [[9, 2, 1, "", "__init__"], [9, 2, 1, "", "process"]], "data_juicer.ops.selector": [[10, 1, 1, "", "FrequencySpecifiedFieldSelector"], [10, 1, 1, "", "RandomSelector"], [10, 1, 1, "", "RangeSpecifiedFieldSelector"], [10, 1, 1, "", "TopkSpecifiedFieldSelector"]], "data_juicer.ops.selector.FrequencySpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RandomSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.RangeSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]], "data_juicer.ops.selector.TopkSpecifiedFieldSelector": [[10, 2, 1, "", "__init__"], [10, 2, 1, "", "process"]]}, "objtypes": {"0": "py:module", "1": "py:class", "2": "py:method", "3": "py:function", "4": "py:attribute"}, "objnames": {"0": ["py", "module", "Python module"], "1": ["py", "class", "Python class"], "2": ["py", "method", "Python method"], "3": ["py", "function", "Python function"], "4": ["py", "attribute", "Python attribute"]}, "titleterms": {"data_juic": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14], "analysi": 1, "config": 2, "core": 3, "format": 4, "op": [5, 6, 7, 8, 9, 10], "common": 6, "dedupl": 7, "filter": 8, "mapper": 9, "selector": 10, "tool": 11, "util": 12, "welcom": 13, "data": 13, "juicer": 13, "": 13, "document": 13, "tutori": 13, "api": 13, "refer": 13, "indic": 13, "tabl": 13}, "envversion": {"sphinx.domains.c": 3, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 9, "sphinx.domains.index": 1, "sphinx.domains.javascript": 3, "sphinx.domains.math": 2, "sphinx.domains.python": 4, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx.ext.viewcode": 1, "sphinx": 58}, "alltitles": {"data_juicer": [[0, "module-data_juicer"], [14, "data-juicer"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]], "Welcome to data-juicer\u2019s documentation!": [[13, "welcome-to-data-juicer-s-documentation"]], "Tutorial": [[13, "tutorial"]], "API Reference": [[13, null]], "Indices and Tables": [[13, "indices-and-tables"]]}, "indexentries": {"cuda_device_count() (in module data_juicer)": [[0, "data_juicer.cuda_device_count"]], "data_juicer": [[0, "module-data_juicer"]], "is_cuda_available() (in module data_juicer)": [[0, "data_juicer.is_cuda_available"]], "module": [[0, "module-data_juicer"], [1, "module-data_juicer.analysis"], [2, "module-data_juicer.config"], [3, "module-data_juicer.core"], [4, "module-data_juicer.format"], [5, "module-data_juicer.ops"], [6, "module-data_juicer.ops.common"], [7, "module-data_juicer.ops.deduplicator"], [8, "module-data_juicer.ops.filter"], [9, "module-data_juicer.ops.mapper"], [10, "module-data_juicer.ops.selector"], [11, "module-data_juicer.tools"], [12, "module-data_juicer.utils"]], "columnwiseanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.ColumnWiseAnalysis"]], "diversityanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.DiversityAnalysis"]], "overallanalysis (class in data_juicer.analysis)": [[1, "data_juicer.analysis.OverallAnalysis"]], "__init__() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.__init__"]], "__init__() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.__init__"]], "__init__() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.__init__"]], "analyze() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.analyze"]], "analyze() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.analyze"]], "analyze() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.analyze"]], "compute() (data_juicer.analysis.diversityanalysis method)": [[1, "data_juicer.analysis.DiversityAnalysis.compute"]], "data_juicer.analysis": [[1, "module-data_juicer.analysis"]], "draw_box() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_box"]], "draw_hist() (data_juicer.analysis.columnwiseanalysis method)": [[1, "data_juicer.analysis.ColumnWiseAnalysis.draw_hist"]], "refine_single_column() (data_juicer.analysis.overallanalysis method)": [[1, "data_juicer.analysis.OverallAnalysis.refine_single_column"]], "data_juicer.config": [[2, "module-data_juicer.config"]], "export_config() (in module data_juicer.config)": [[2, "data_juicer.config.export_config"]], "init_configs() (in module data_juicer.config)": [[2, "data_juicer.config.init_configs"]], "merge_config() (in module data_juicer.config)": [[2, "data_juicer.config.merge_config"]], "analyzer (class in data_juicer.core)": [[3, "data_juicer.core.Analyzer"]], "executor (class in data_juicer.core)": [[3, "data_juicer.core.Executor"]], "exporter (class in data_juicer.core)": [[3, "data_juicer.core.Exporter"]], "gib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.GiB"]], "kib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.KiB"]], "mib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.MiB"]], "nesteddataset (class in data_juicer.core)": [[3, "data_juicer.core.NestedDataset"]], "tib (data_juicer.core.exporter attribute)": [[3, "data_juicer.core.Exporter.TiB"]], "tracer (class in data_juicer.core)": [[3, "data_juicer.core.Tracer"]], "__init__() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.__init__"]], "__init__() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.__init__"]], "__init__() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.__init__"]], "__init__() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.__init__"]], "__init__() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.__init__"]], "add_column() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.add_column"]], "cleanup_cache_files() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.cleanup_cache_files"]], "data_juicer.core": [[3, "module-data_juicer.core"]], "export() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export"]], "export_compute_stats() (data_juicer.core.exporter method)": [[3, "data_juicer.core.Exporter.export_compute_stats"]], "filter() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.filter"]], "from_dict() (data_juicer.core.nesteddataset class method)": [[3, "data_juicer.core.NestedDataset.from_dict"]], "map() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.map"]], "process() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.process"]], "remove_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.remove_columns"]], "run() (data_juicer.core.analyzer method)": [[3, "data_juicer.core.Analyzer.run"]], "run() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.run"]], "sample_data() (data_juicer.core.executor method)": [[3, "data_juicer.core.Executor.sample_data"]], "select() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select"]], "select_columns() (data_juicer.core.nesteddataset method)": [[3, "data_juicer.core.NestedDataset.select_columns"]], "to_json() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_json"]], "to_jsonl() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_jsonl"]], "to_parquet() (data_juicer.core.exporter static method)": [[3, "data_juicer.core.Exporter.to_parquet"]], "trace_batch_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_batch_mapper"]], "trace_deduplicator() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_deduplicator"]], "trace_filter() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_filter"]], "trace_mapper() (data_juicer.core.tracer method)": [[3, "data_juicer.core.Tracer.trace_mapper"]], "csvformatter (class in data_juicer.format)": [[4, "data_juicer.format.CsvFormatter"]], "jsonformatter (class in data_juicer.format)": [[4, "data_juicer.format.JsonFormatter"]], "localformatter (class in data_juicer.format)": [[4, "data_juicer.format.LocalFormatter"]], "mixtureformatter (class in data_juicer.format)": [[4, "data_juicer.format.MixtureFormatter"]], "parquetformatter (class in data_juicer.format)": [[4, "data_juicer.format.ParquetFormatter"]], "remoteformatter (class in data_juicer.format)": [[4, "data_juicer.format.RemoteFormatter"]], "suffixes (data_juicer.format.csvformatter attribute)": [[4, "data_juicer.format.CsvFormatter.SUFFIXES"]], "suffixes (data_juicer.format.jsonformatter attribute)": [[4, "data_juicer.format.JsonFormatter.SUFFIXES"]], "suffixes (data_juicer.format.parquetformatter attribute)": [[4, "data_juicer.format.ParquetFormatter.SUFFIXES"]], "suffixes (data_juicer.format.textformatter attribute)": [[4, "data_juicer.format.TextFormatter.SUFFIXES"]], "suffixes (data_juicer.format.tsvformatter attribute)": [[4, "data_juicer.format.TsvFormatter.SUFFIXES"]], "textformatter (class in data_juicer.format)": [[4, "data_juicer.format.TextFormatter"]], "tsvformatter (class in data_juicer.format)": [[4, "data_juicer.format.TsvFormatter"]], "__init__() (data_juicer.format.csvformatter method)": [[4, "data_juicer.format.CsvFormatter.__init__"]], "__init__() (data_juicer.format.jsonformatter method)": [[4, "data_juicer.format.JsonFormatter.__init__"]], "__init__() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.__init__"]], "__init__() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.__init__"]], "__init__() (data_juicer.format.parquetformatter method)": [[4, "data_juicer.format.ParquetFormatter.__init__"]], "__init__() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.__init__"]], "__init__() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.__init__"]], "__init__() (data_juicer.format.tsvformatter method)": [[4, "data_juicer.format.TsvFormatter.__init__"]], "data_juicer.format": [[4, "module-data_juicer.format"]], "load_dataset() (data_juicer.format.localformatter method)": [[4, "data_juicer.format.LocalFormatter.load_dataset"]], "load_dataset() (data_juicer.format.mixtureformatter method)": [[4, "data_juicer.format.MixtureFormatter.load_dataset"]], "load_dataset() (data_juicer.format.remoteformatter method)": [[4, "data_juicer.format.RemoteFormatter.load_dataset"]], "load_dataset() (data_juicer.format.textformatter method)": [[4, "data_juicer.format.TextFormatter.load_dataset"]], "load_formatter() (in module data_juicer.format)": [[4, "data_juicer.format.load_formatter"]], "random_sample() (data_juicer.format.mixtureformatter class method)": [[4, "data_juicer.format.MixtureFormatter.random_sample"]], "deduplicator (class in data_juicer.ops)": [[5, "data_juicer.ops.Deduplicator"]], "filter (class in data_juicer.ops)": [[5, "data_juicer.ops.Filter"]], "mapper (class in data_juicer.ops)": [[5, "data_juicer.ops.Mapper"]], "selector (class in data_juicer.ops)": [[5, "data_juicer.ops.Selector"]], "__init__() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.__init__"]], "__init__() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.__init__"]], "__init__() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.__init__"]], "__init__() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.__init__"]], "compute_hash() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.compute_stats"]], "data_juicer.ops": [[5, "module-data_juicer.ops"]], "load_ops() (in module data_juicer.ops)": [[5, "data_juicer.ops.load_ops"]], "process() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.process"]], "process() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.process"]], "process() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.process"]], "process() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.process"]], "run() (data_juicer.ops.deduplicator method)": [[5, "data_juicer.ops.Deduplicator.run"]], "run() (data_juicer.ops.filter method)": [[5, "data_juicer.ops.Filter.run"]], "run() (data_juicer.ops.mapper method)": [[5, "data_juicer.ops.Mapper.run"]], "run() (data_juicer.ops.selector method)": [[5, "data_juicer.ops.Selector.run"]], "data_juicer.ops.common": [[6, "module-data_juicer.ops.common"]], "get_sentences_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_sentences_from_document"]], "get_words_from_document() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.get_words_from_document"]], "merge_on_whitespace_tab_newline() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.merge_on_whitespace_tab_newline"]], "split_on_newline_tab_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_newline_tab_whitespace"]], "split_on_whitespace() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.split_on_whitespace"]], "strip() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.strip"]], "words_augmentation() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_augmentation"]], "words_refinement() (in module data_juicer.ops.common)": [[6, "data_juicer.ops.common.words_refinement"]], "documentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator"]], "documentminhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator"]], "documentsimhashdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator"]], "empty_hash_value (data_juicer.ops.deduplicator.raybasicdeduplicator attribute)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.EMPTY_HASH_VALUE"]], "imagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator"]], "raybasicdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator"]], "raydocumentdeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator"]], "rayimagededuplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator"]], "rayvideodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator"]], "videodeduplicator (class in data_juicer.ops.deduplicator)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator"]], "__init__() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.__init__"]], "__init__() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.__init__"]], "calculate_hash() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.raydocumentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayDocumentDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayimagededuplicator method)": [[7, "data_juicer.ops.deduplicator.RayImageDeduplicator.calculate_hash"]], "calculate_hash() (data_juicer.ops.deduplicator.rayvideodeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayVideoDeduplicator.calculate_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.compute_hash"]], "compute_hash() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.compute_hash"]], "compute_stats() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.compute_stats"]], "data_juicer.ops.deduplicator": [[7, "module-data_juicer.ops.deduplicator"]], "process() (data_juicer.ops.deduplicator.documentdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentminhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentMinhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.documentsimhashdeduplicator method)": [[7, "data_juicer.ops.deduplicator.DocumentSimhashDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.imagededuplicator method)": [[7, "data_juicer.ops.deduplicator.ImageDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.raybasicdeduplicator method)": [[7, "data_juicer.ops.deduplicator.RayBasicDeduplicator.process"]], "process() (data_juicer.ops.deduplicator.videodeduplicator method)": [[7, "data_juicer.ops.deduplicator.VideoDeduplicator.process"]], "alphanumericfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AlphanumericFilter"]], "audiodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioDurationFilter"]], "audionmfsnrfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter"]], "audiosizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AudioSizeFilter"]], "averagelinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter"]], "characterrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter"]], "flaggedwordfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.FlaggedWordFilter"]], "imageaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter"]], "imageaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter"]], "imagefaceratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter"]], "imagensfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageNSFWFilter"]], "imageshapefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageShapeFilter"]], "imagesizefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageSizeFilter"]], "imagetextmatchingfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter"]], "imagetextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter"]], "imagewatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter"]], "languageidscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter"]], "maximumlinelengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter"]], "perplexityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PerplexityFilter"]], "phrasegroundingrecallfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter"]], "specialcharactersfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter"]], "specifiedfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter"]], "specifiednumericfieldfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter"]], "stopwordsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.StopWordsFilter"]], "suffixfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.SuffixFilter"]], "textactionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextActionFilter"]], "textentitydependencyfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter"]], "textlengthfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TextLengthFilter"]], "tokennumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.TokenNumFilter"]], "videoaestheticsfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter"]], "videoaspectratiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter"]], "videodurationfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoDurationFilter"]], "videoframestextsimilarityfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter"]], "videomotionscorefilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter"]], "videonsfwfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoNSFWFilter"]], "videoocrarearatiofilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter"]], "videoresolutionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoResolutionFilter"]], "videotaggingfromframesfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter"]], "videowatermarkfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter"]], "wordrepetitionfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordRepetitionFilter"]], "wordsnumfilter (class in data_juicer.ops.filter)": [[8, "data_juicer.ops.filter.WordsNumFilter"]], "__init__() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.__init__"]], "__init__() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.__init__"]], "__init__() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.__init__"]], "__init__() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.__init__"]], "__init__() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.__init__"]], "__init__() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.__init__"]], "__init__() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.__init__"]], "__init__() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.__init__"]], "__init__() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.__init__"]], "__init__() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.__init__"]], "__init__() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.__init__"]], "__init__() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.__init__"]], "__init__() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.__init__"]], "__init__() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.__init__"]], "__init__() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.__init__"]], "__init__() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.__init__"]], "compute_stats() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.compute_stats"]], "compute_stats() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.compute_stats"]], "data_juicer.ops.filter": [[8, "module-data_juicer.ops.filter"]], "get_reader() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.get_reader"]], "process() (data_juicer.ops.filter.alphanumericfilter method)": [[8, "data_juicer.ops.filter.AlphanumericFilter.process"]], "process() (data_juicer.ops.filter.audiodurationfilter method)": [[8, "data_juicer.ops.filter.AudioDurationFilter.process"]], "process() (data_juicer.ops.filter.audionmfsnrfilter method)": [[8, "data_juicer.ops.filter.AudioNMFSNRFilter.process"]], "process() (data_juicer.ops.filter.audiosizefilter method)": [[8, "data_juicer.ops.filter.AudioSizeFilter.process"]], "process() (data_juicer.ops.filter.averagelinelengthfilter method)": [[8, "data_juicer.ops.filter.AverageLineLengthFilter.process"]], "process() (data_juicer.ops.filter.characterrepetitionfilter method)": [[8, "data_juicer.ops.filter.CharacterRepetitionFilter.process"]], "process() (data_juicer.ops.filter.flaggedwordfilter method)": [[8, "data_juicer.ops.filter.FlaggedWordFilter.process"]], "process() (data_juicer.ops.filter.imageaestheticsfilter method)": [[8, "data_juicer.ops.filter.ImageAestheticsFilter.process"]], "process() (data_juicer.ops.filter.imageaspectratiofilter method)": [[8, "data_juicer.ops.filter.ImageAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.imagefaceratiofilter method)": [[8, "data_juicer.ops.filter.ImageFaceRatioFilter.process"]], "process() (data_juicer.ops.filter.imagensfwfilter method)": [[8, "data_juicer.ops.filter.ImageNSFWFilter.process"]], "process() (data_juicer.ops.filter.imageshapefilter method)": [[8, "data_juicer.ops.filter.ImageShapeFilter.process"]], "process() (data_juicer.ops.filter.imagesizefilter method)": [[8, "data_juicer.ops.filter.ImageSizeFilter.process"]], "process() (data_juicer.ops.filter.imagetextmatchingfilter method)": [[8, "data_juicer.ops.filter.ImageTextMatchingFilter.process"]], "process() (data_juicer.ops.filter.imagetextsimilarityfilter method)": [[8, "data_juicer.ops.filter.ImageTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.imagewatermarkfilter method)": [[8, "data_juicer.ops.filter.ImageWatermarkFilter.process"]], "process() (data_juicer.ops.filter.languageidscorefilter method)": [[8, "data_juicer.ops.filter.LanguageIDScoreFilter.process"]], "process() (data_juicer.ops.filter.maximumlinelengthfilter method)": [[8, "data_juicer.ops.filter.MaximumLineLengthFilter.process"]], "process() (data_juicer.ops.filter.perplexityfilter method)": [[8, "data_juicer.ops.filter.PerplexityFilter.process"]], "process() (data_juicer.ops.filter.phrasegroundingrecallfilter method)": [[8, "data_juicer.ops.filter.PhraseGroundingRecallFilter.process"]], "process() (data_juicer.ops.filter.specialcharactersfilter method)": [[8, "data_juicer.ops.filter.SpecialCharactersFilter.process"]], "process() (data_juicer.ops.filter.specifiedfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedFieldFilter.process"]], "process() (data_juicer.ops.filter.specifiednumericfieldfilter method)": [[8, "data_juicer.ops.filter.SpecifiedNumericFieldFilter.process"]], "process() (data_juicer.ops.filter.stopwordsfilter method)": [[8, "data_juicer.ops.filter.StopWordsFilter.process"]], "process() (data_juicer.ops.filter.suffixfilter method)": [[8, "data_juicer.ops.filter.SuffixFilter.process"]], "process() (data_juicer.ops.filter.textactionfilter method)": [[8, "data_juicer.ops.filter.TextActionFilter.process"]], "process() (data_juicer.ops.filter.textentitydependencyfilter method)": [[8, "data_juicer.ops.filter.TextEntityDependencyFilter.process"]], "process() (data_juicer.ops.filter.textlengthfilter method)": [[8, "data_juicer.ops.filter.TextLengthFilter.process"]], "process() (data_juicer.ops.filter.tokennumfilter method)": [[8, "data_juicer.ops.filter.TokenNumFilter.process"]], "process() (data_juicer.ops.filter.videoaestheticsfilter method)": [[8, "data_juicer.ops.filter.VideoAestheticsFilter.process"]], "process() (data_juicer.ops.filter.videoaspectratiofilter method)": [[8, "data_juicer.ops.filter.VideoAspectRatioFilter.process"]], "process() (data_juicer.ops.filter.videodurationfilter method)": [[8, "data_juicer.ops.filter.VideoDurationFilter.process"]], "process() (data_juicer.ops.filter.videoframestextsimilarityfilter method)": [[8, "data_juicer.ops.filter.VideoFramesTextSimilarityFilter.process"]], "process() (data_juicer.ops.filter.videomotionscorefilter method)": [[8, "data_juicer.ops.filter.VideoMotionScoreFilter.process"]], "process() (data_juicer.ops.filter.videonsfwfilter method)": [[8, "data_juicer.ops.filter.VideoNSFWFilter.process"]], "process() (data_juicer.ops.filter.videoocrarearatiofilter method)": [[8, "data_juicer.ops.filter.VideoOcrAreaRatioFilter.process"]], "process() (data_juicer.ops.filter.videoresolutionfilter method)": [[8, "data_juicer.ops.filter.VideoResolutionFilter.process"]], "process() (data_juicer.ops.filter.videotaggingfromframesfilter method)": [[8, "data_juicer.ops.filter.VideoTaggingFromFramesFilter.process"]], "process() (data_juicer.ops.filter.videowatermarkfilter method)": [[8, "data_juicer.ops.filter.VideoWatermarkFilter.process"]], "process() (data_juicer.ops.filter.wordrepetitionfilter method)": [[8, "data_juicer.ops.filter.WordRepetitionFilter.process"]], "process() (data_juicer.ops.filter.wordsnumfilter method)": [[8, "data_juicer.ops.filter.WordsNumFilter.process"]], "audioffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper"]], "chineseconvertmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper"]], "cleancopyrightmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper"]], "cleanemailmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanEmailMapper"]], "cleanhtmlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper"]], "cleanipmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanIpMapper"]], "cleanlinksmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.CleanLinksMapper"]], "expandmacromapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper"]], "extractqamapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ExtractQAMapper"]], "fixunicodemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper"]], "imageblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageBlurMapper"]], "imagecaptioningfromgpt4vmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper"]], "imagecaptioningmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper"]], "imagediffusionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper"]], "imagefaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper"]], "nlpaugenmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper"]], "nlpcdazhmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper"]], "punctuationnormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper"]], "removebibliographymapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper"]], "removecommentsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper"]], "removeheadermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper"]], "removelongwordsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper"]], "removenonchinesecharacterlmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper"]], "removerepeatsentencesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper"]], "removespecificcharsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper"]], "removetabletextmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper"]], "removewordswithincorrectsubstringsmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper"]], "replacecontentmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper"]], "strategy (data_juicer.ops.mapper.videoresizeaspectratiomapper attribute)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.STRATEGY"]], "sentencesplitmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper"]], "videocaptioningfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper"]], "videocaptioningfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper"]], "videocaptioningfromsummarizermapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper"]], "videocaptioningfromvideomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper"]], "videoffmpegwrappedmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper"]], "videofaceblurmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper"]], "videoremovewatermarkmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper"]], "videoresizeaspectratiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper"]], "videoresizeresolutionmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper"]], "videosplitbydurationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper"]], "videosplitbykeyframemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper"]], "videosplitbyscenemapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper"]], "videotaggingfromaudiomapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper"]], "videotaggingfromframesmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper"]], "whitespacenormalizationmapper (class in data_juicer.ops.mapper)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper"]], "__init__() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.__init__"]], "__init__() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.__init__"]], "__init__() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.__init__"]], "__init__() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.__init__"]], "__init__() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.__init__"]], "__init__() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.__init__"]], "__init__() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.__init__"]], "__init__() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.__init__"]], "__init__() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.__init__"]], "__init__() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.__init__"]], "__init__() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.__init__"]], "__init__() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.__init__"]], "avaliable_detectors (data_juicer.ops.mapper.videosplitbyscenemapper attribute)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.avaliable_detectors"]], "data_juicer.ops.mapper": [[9, "module-data_juicer.ops.mapper"]], "get_split_key_frame() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.get_split_key_frame"]], "process() (data_juicer.ops.mapper.audioffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.AudioFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.chineseconvertmapper method)": [[9, "data_juicer.ops.mapper.ChineseConvertMapper.process"]], "process() (data_juicer.ops.mapper.cleancopyrightmapper method)": [[9, "data_juicer.ops.mapper.CleanCopyrightMapper.process"]], "process() (data_juicer.ops.mapper.cleanemailmapper method)": [[9, "data_juicer.ops.mapper.CleanEmailMapper.process"]], "process() (data_juicer.ops.mapper.cleanhtmlmapper method)": [[9, "data_juicer.ops.mapper.CleanHtmlMapper.process"]], "process() (data_juicer.ops.mapper.cleanipmapper method)": [[9, "data_juicer.ops.mapper.CleanIpMapper.process"]], "process() (data_juicer.ops.mapper.cleanlinksmapper method)": [[9, "data_juicer.ops.mapper.CleanLinksMapper.process"]], "process() (data_juicer.ops.mapper.expandmacromapper method)": [[9, "data_juicer.ops.mapper.ExpandMacroMapper.process"]], "process() (data_juicer.ops.mapper.extractqamapper method)": [[9, "data_juicer.ops.mapper.ExtractQAMapper.process"]], "process() (data_juicer.ops.mapper.fixunicodemapper method)": [[9, "data_juicer.ops.mapper.FixUnicodeMapper.process"]], "process() (data_juicer.ops.mapper.imageblurmapper method)": [[9, "data_juicer.ops.mapper.ImageBlurMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningfromgpt4vmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningFromGPT4VMapper.process"]], "process() (data_juicer.ops.mapper.imagecaptioningmapper method)": [[9, "data_juicer.ops.mapper.ImageCaptioningMapper.process"]], "process() (data_juicer.ops.mapper.imagediffusionmapper method)": [[9, "data_juicer.ops.mapper.ImageDiffusionMapper.process"]], "process() (data_juicer.ops.mapper.imagefaceblurmapper method)": [[9, "data_juicer.ops.mapper.ImageFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.nlpaugenmapper method)": [[9, "data_juicer.ops.mapper.NlpaugEnMapper.process"]], "process() (data_juicer.ops.mapper.nlpcdazhmapper method)": [[9, "data_juicer.ops.mapper.NlpcdaZhMapper.process"]], "process() (data_juicer.ops.mapper.punctuationnormalizationmapper method)": [[9, "data_juicer.ops.mapper.PunctuationNormalizationMapper.process"]], "process() (data_juicer.ops.mapper.removebibliographymapper method)": [[9, "data_juicer.ops.mapper.RemoveBibliographyMapper.process"]], "process() (data_juicer.ops.mapper.removecommentsmapper method)": [[9, "data_juicer.ops.mapper.RemoveCommentsMapper.process"]], "process() (data_juicer.ops.mapper.removeheadermapper method)": [[9, "data_juicer.ops.mapper.RemoveHeaderMapper.process"]], "process() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.process"]], "process() (data_juicer.ops.mapper.removenonchinesecharacterlmapper method)": [[9, "data_juicer.ops.mapper.RemoveNonChineseCharacterlMapper.process"]], "process() (data_juicer.ops.mapper.removerepeatsentencesmapper method)": [[9, "data_juicer.ops.mapper.RemoveRepeatSentencesMapper.process"]], "process() (data_juicer.ops.mapper.removespecificcharsmapper method)": [[9, "data_juicer.ops.mapper.RemoveSpecificCharsMapper.process"]], "process() (data_juicer.ops.mapper.removetabletextmapper method)": [[9, "data_juicer.ops.mapper.RemoveTableTextMapper.process"]], "process() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.process"]], "process() (data_juicer.ops.mapper.replacecontentmapper method)": [[9, "data_juicer.ops.mapper.ReplaceContentMapper.process"]], "process() (data_juicer.ops.mapper.sentencesplitmapper method)": [[9, "data_juicer.ops.mapper.SentenceSplitMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromsummarizermapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromSummarizerMapper.process"]], "process() (data_juicer.ops.mapper.videocaptioningfromvideomapper method)": [[9, "data_juicer.ops.mapper.VideoCaptioningFromVideoMapper.process"]], "process() (data_juicer.ops.mapper.videoffmpegwrappedmapper method)": [[9, "data_juicer.ops.mapper.VideoFFmpegWrappedMapper.process"]], "process() (data_juicer.ops.mapper.videofaceblurmapper method)": [[9, "data_juicer.ops.mapper.VideoFaceBlurMapper.process"]], "process() (data_juicer.ops.mapper.videoremovewatermarkmapper method)": [[9, "data_juicer.ops.mapper.VideoRemoveWatermarkMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeaspectratiomapper method)": [[9, "data_juicer.ops.mapper.VideoResizeAspectRatioMapper.process"]], "process() (data_juicer.ops.mapper.videoresizeresolutionmapper method)": [[9, "data_juicer.ops.mapper.VideoResizeResolutionMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbykeyframemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByKeyFrameMapper.process"]], "process() (data_juicer.ops.mapper.videosplitbyscenemapper method)": [[9, "data_juicer.ops.mapper.VideoSplitBySceneMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromaudiomapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromAudioMapper.process"]], "process() (data_juicer.ops.mapper.videotaggingfromframesmapper method)": [[9, "data_juicer.ops.mapper.VideoTaggingFromFramesMapper.process"]], "process() (data_juicer.ops.mapper.whitespacenormalizationmapper method)": [[9, "data_juicer.ops.mapper.WhitespaceNormalizationMapper.process"]], "should_keep_long_word() (data_juicer.ops.mapper.removelongwordsmapper method)": [[9, "data_juicer.ops.mapper.RemoveLongWordsMapper.should_keep_long_word"]], "should_keep_word_with_incorrect_substrings() (data_juicer.ops.mapper.removewordswithincorrectsubstringsmapper method)": [[9, "data_juicer.ops.mapper.RemoveWordsWithIncorrectSubstringsMapper.should_keep_word_with_incorrect_substrings"]], "split_videos_by_duration() (data_juicer.ops.mapper.videosplitbydurationmapper method)": [[9, "data_juicer.ops.mapper.VideoSplitByDurationMapper.split_videos_by_duration"]], "frequencyspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector"]], "randomselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RandomSelector"]], "rangespecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector"]], "topkspecifiedfieldselector (class in data_juicer.ops.selector)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector"]], "__init__() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.__init__"]], "__init__() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.__init__"]], "__init__() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.__init__"]], "data_juicer.ops.selector": [[10, "module-data_juicer.ops.selector"]], "process() (data_juicer.ops.selector.frequencyspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.FrequencySpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.randomselector method)": [[10, "data_juicer.ops.selector.RandomSelector.process"]], "process() (data_juicer.ops.selector.rangespecifiedfieldselector method)": [[10, "data_juicer.ops.selector.RangeSpecifiedFieldSelector.process"]], "process() (data_juicer.ops.selector.topkspecifiedfieldselector method)": [[10, "data_juicer.ops.selector.TopkSpecifiedFieldSelector.process"]], "data_juicer.tools": [[11, "module-data_juicer.tools"]], "data_juicer.utils": [[12, "module-data_juicer.utils"]]}}) \ No newline at end of file