diff --git a/_modules/data_juicer/analysis/column_wise_analysis.html b/_modules/data_juicer/analysis/column_wise_analysis.html
deleted file mode 100644
index 0594bdff5..000000000
--- a/_modules/data_juicer/analysis/column_wise_analysis.html
+++ /dev/null
@@ -1,408 +0,0 @@
Source code for data_juicer.analysis.column_wise_analysis

-import math
-import os
-
-import matplotlib.pyplot as plt
-import pandas as pd
-from tqdm import tqdm
-
-from data_juicer.utils.constant import Fields
-
-from .overall_analysis import OverallAnalysis
-
-
-def get_row_col(total_num, factor=2):
-    """
-    Given the total number of stats figures, get the "best" number of rows and
-    columns. This function is needed when we need to store all stats figures
-    into one image.
-
-    :param total_num: Total number of stats figures
-    :param factor: number of sub-figure types in each figure. By
-        default it's 2, which means there is a histogram and a box plot
-        for each stat figure
-    :return: "best" number of rows and columns, and the grid list
-    """
-    n = total_num * factor  # actual number of figures
-    now_col = factor  # search from the minimum number of columns
-    now_row = total_num
-    for col in range(factor, n + 1, factor):
-        row = n * 1.0 / col
-        if row != int(row):  # skip non-integer results
-            continue
-        if col > row:
-            # objective: minimize the difference between the numbers of
-            # columns and rows
-            if abs(col - row) > abs(now_col - now_row):
-                break
-            else:
-                now_row = row
-                now_col = col
-                break
-        now_row = row
-        now_col = col
-
-    # different sub-figures of the same stats should be in the same row
-    now_col = now_col // factor
-
-    # get grid indexes
-    grids = []
-    for i in range(total_num):
-        grids.append((i // now_col, i % now_col))
-
-    return int(now_row), int(now_col), grids
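For intuition, a hand-traced example of the loop above (a hedged sketch; the numbers follow the algorithm and are not from the repository):

    # 6 stats with factor=2 (12 sub-figures in total) land on a 3x2 grid
    # of stats, each cell holding a histogram + box-plot pair.
    row, col, grids = get_row_col(6, factor=2)
    assert (row, col) == (3, 2)
    assert grids == [(0, 0), (0, 1), (1, 0), (1, 1), (2, 0), (2, 1)]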
-
-
-
-class ColumnWiseAnalysis:
-    """Apply analysis on each column of stats respectively."""
-
-    def __init__(self,
-                 dataset,
-                 output_path,
-                 overall_result=None,
-                 save_stats_in_one_file=True):
-        """
-        Initialization method.
-
-        :param dataset: the dataset to be analyzed
-        :param output_path: path to store the analysis results
-        :param overall_result: optional precomputed overall stats result
-        :param save_stats_in_one_file: whether to save all analysis figures
-            of all stats into one image file
-        """
-        self.stats = pd.DataFrame(dataset[Fields.stats])
-        self.output_path = output_path
-        if not os.path.exists(self.output_path):
-            os.makedirs(self.output_path)
-
-        # if no overall description is provided, analyze it from scratch
-        if overall_result is None:
-            oa = OverallAnalysis(dataset, output_path)
-            overall_result = oa.analyze()
-        self.overall_result = overall_result
-
-        self.save_stats_in_one_file = save_stats_in_one_file
-
-    def analyze(self, show_percentiles=False, show=False, skip_export=False):
-        """
-        Apply analysis and draw the analysis figure for stats.
-
-        :param show_percentiles: whether to show the percentile lines in
-            each sub-figure. If true, several red lines will indicate the
-            quantiles of the stats distributions
-        :param show: whether to show the figure in a single window after
-            drawing
-        :param skip_export: whether to skip saving the results to disk
-        :return:
-        """
-        # number of sub-figures for each stat. There are a histogram and a
-        # box plot for now, so it's 2.
-        num_subcol = 2
-
-        # default width and height unit for each sub-figure
-        width_unit = 4
-        height_unit = 6
-
-        columns = self.stats.columns
-        num = len(columns)
-
-        # get the recommended "best" number of columns and rows
-        rec_row, rec_col, grid_indexes = get_row_col(num, num_subcol)
-
-        if self.save_stats_in_one_file:
-            # if save_stats_in_one_file is enabled, use the recommended
-            # "best" numbers of columns and rows to initialize the image
-            # panel.
-            rec_width = rec_col * num_subcol * width_unit
-            rec_height = rec_row * height_unit
-            fig = plt.figure(figsize=(rec_width, rec_height),
-                             layout='constrained')
-            subfigs = fig.subfigures(rec_row, rec_col, wspace=0.01)
-        for i, column_name in enumerate(tqdm(columns.to_list(),
-                                             desc='Column')):
-            data = self.stats[column_name]
-            # explode data to flatten inner lists
-            data = data.explode().infer_objects()
-            grid = grid_indexes[i]
-            if self.save_stats_in_one_file:
-                if rec_col == 1:
-                    grid = grid[0]
-                elif rec_row == 1:
-                    grid = grid[1]
-
-                if rec_col == 1 and rec_row == 1:
-                    subfig = subfigs
-                else:
-                    subfig = subfigs[grid]
-                subfig.set_facecolor('0.85')
-
-            # distinguish numeric and string columns via NaN of the 'top'
-            # field, and apply a different plot method to each.
-            if pd.isna(self.overall_result[column_name].get('top')):
-                # numeric or numeric list -- draw a histogram and a box plot
-                # for this stat
-                percentiles = self.overall_result[column_name] \
-                    if show_percentiles else None
-
-                # get axes for each subplot
-                if self.save_stats_in_one_file:
-                    axes = subfig.subplots(1, num_subcol)
-                else:
-                    axes = [None] * num_subcol
-
-                # draw histogram
-                self.draw_hist(axes[0],
-                               data,
-                               os.path.join(self.output_path,
-                                            f'{column_name}-hist.png'),
-                               percentiles=percentiles)
-
-                # draw box plot
-                self.draw_box(axes[1],
-                              data,
-                              os.path.join(self.output_path,
-                                           f'{column_name}-box.png'),
-                              percentiles=percentiles)
-            else:
-                # object (string) or string list -- only draw a histogram
-                # for this stat
-                if self.save_stats_in_one_file:
-                    axes = subfig.subplots(1, 1)
-                else:
-                    axes = None
-
-                if not skip_export:
-                    self.draw_hist(
-                        axes, data,
-                        os.path.join(self.output_path,
-                                     f'{column_name}-hist.png'))
-
-            # add a title to the figure of this stat
-            if self.save_stats_in_one_file:
-                subfig.suptitle(f'{data.name}',
-                                fontsize='x-large',
-                                fontweight='bold')
-
-        if self.save_stats_in_one_file:
-            fig = plt.gcf()
-            if not skip_export:
-                fig.savefig(os.path.join(self.output_path, 'all-stats.png'))
-            if show:
-                plt.show()
-            else:
-                # TODO: (fixme) the saved png is sometimes blank
-                plt.clf()
-
-    def draw_hist(self, ax, data, save_path, percentiles=None, show=False):
-        """
-        Draw the histogram for the data.
-
-        :param ax: the axes to draw on
-        :param data: data to draw
-        :param save_path: the path to save the histogram figure
-        :param percentiles: the overall analysis result of the data,
-            including percentile information
-        :param show: whether to show the figure in a single window after
-            drawing
-        :return:
-        """
-        # recommended number of bins
-        data_num = len(data)
-        if data_num >= 100:
-            rec_bins = int(math.sqrt(data_num))
-        else:
-            rec_bins = None
-
-        # if ax is None, use the plot method in pandas
-        if ax is None:
-            ax = data.hist(bins=rec_bins, figsize=(20, 16))
-        else:
-            ax.hist(data, bins=rec_bins)
-
-        # set axes labels
-        ax.set_xlabel(data.name)
-        ax.set_ylabel('Count')
-
-        # draw percentile lines if percentiles is not None
-        if percentiles is not None:
-            ymin, ymax = ax.get_ylim()
-            for percentile in percentiles.keys():
-                # skip non-percentile information
-                if percentile in {'count', 'unique', 'top', 'freq', 'std'}:
-                    continue
-                value = percentiles[percentile]
-
-                ax.vlines(x=value, ymin=ymin, ymax=ymax, colors='r')
-                ax.text(x=value, y=ymax, s=percentile, rotation=30, color='r')
-                ax.text(x=value,
-                        y=ymax * 0.97,
-                        s=str(round(value, 3)),
-                        rotation=30,
-                        color='r')
-
-        if not self.save_stats_in_one_file:
-            # save to file
-            plt.savefig(save_path)
-
-            if show:
-                plt.show()
-            else:
-                # if not showing, clear this axes to avoid accumulated,
-                # overlapped figures across different draw_xxx calls
-                ax.clear()
-        else:
-            # add a little rotation to the x-axis labels to avoid overlapping
-            ax.tick_params(axis='x', rotation=25)
-
-    def draw_box(self, ax, data, save_path, percentiles=None, show=False):
-        """
-        Draw the box plot for the data.
-
-        :param ax: the axes to draw on
-        :param data: data to draw
-        :param save_path: the path to save the box figure
-        :param percentiles: the overall analysis result of the data,
-            including percentile information
-        :param show: whether to show the figure in a single window after
-            drawing
-        :return:
-        """
-        # if ax is None, use the plot method in pandas
-        if ax is None:
-            ax = data.plot.box(figsize=(20, 16))
-        else:
-            ax.boxplot(data)
-
-        # set axes labels
-        ax.set_ylabel(data.name)
-
-        # draw percentile lines if percentiles is not None
-        if percentiles is not None:
-            xmin, xmax = ax.get_xlim()
-            for percentile in percentiles.keys():
-                # skip non-percentile information
-                if percentile in {'count', 'unique', 'top', 'freq', 'std'}:
-                    continue
-                value = percentiles[percentile]
-
-                ax.hlines(y=value, xmin=xmin, xmax=xmax, colors='r')
-                ax.text(y=value,
-                        x=xmin + (xmax - xmin) * 0.6,
-                        s=f'{percentile}: {round(value, 3)}',
-                        color='r')
-
-        if not self.save_stats_in_one_file:
-            # save to file
-            plt.savefig(save_path)
-
-            if show:
-                plt.show()
-            else:
-                # if not showing, clear this axes to avoid accumulated,
-                # overlapped figures across different draw_xxx calls
-                ax.clear()
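A minimal usage sketch of the class above (assuming `dataset` already carries computed stats under Fields.stats, e.g. produced by the Analyzer; the output directory is illustrative):

    analysis = ColumnWiseAnalysis(dataset, './analysis',
                                  save_stats_in_one_file=True)
    # draws a histogram (plus a box plot for numeric stats) per column and
    # combines them into <output_path>/all-stats.png
    analysis.analyze(show_percentiles=True)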
\ No newline at end of file
diff --git a/_modules/data_juicer/analysis/diversity_analysis.html b/_modules/data_juicer/analysis/diversity_analysis.html
deleted file mode 100644
index 430c1c820..000000000
--- a/_modules/data_juicer/analysis/diversity_analysis.html
+++ /dev/null
@@ -1,267 +0,0 @@
Source code for data_juicer.analysis.diversity_analysis

-import os
-
-import pandas as pd
-import spacy
-from loguru import logger
-
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-
-# Modified from self-instruct; please refer to
-# https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb
-def find_root_verb_and_its_dobj(tree_root):
-    """
-    Find the verb and its object closest to the root.
-
-    :param tree_root: the root of the lexical tree
-    :return: valid verb and its object.
-    """
-    # first check if the current node and its children satisfy the condition
-    if tree_root.pos_ == 'VERB':
-        for child in tree_root.children:
-            if child.dep_ == 'dobj' and child.pos_ == 'NOUN':
-                return tree_root.lemma_ if len(
-                    tree_root.lemma_) else tree_root.text, child.lemma_ if len(
-                        child.lemma_) else child.text
-        return tree_root.lemma_ if len(
-            tree_root.lemma_) else tree_root.text, None
-    # if not, recurse into its first child
-    for child in tree_root.children:
-        return find_root_verb_and_its_dobj(child)
-    # if no children satisfy the condition, return None
-    return None, None
-
-
-# Modified from self-instruct; please refer to
-# https://github.com/yizhongw/self-instruct/blob/main/self_instruct/instruction_visualize.ipynb
-def find_root_verb_and_its_dobj_in_string(nlp, s, first_sent=True):
-    """
-    Find the verb and its object closest to the root of the lexical tree of
-    the input string.
-
-    :param nlp: the spaCy model used to analyze the input string
-    :param s: the string to be analyzed
-    :param first_sent: whether to analyze only the first sentence in the
-        input string. If it's true, return the analysis result of the
-        first sentence whether it's valid or not. If it's false, return
-        the first valid result over all sentences
-    :return: valid verb and its object of this string
-    """
-    doc = nlp(s)
-    for sent in doc.sents:
-        verb, noun = find_root_verb_and_its_dobj(sent.root)
-        if first_sent or (verb is not None and noun is not None):
-            return verb, noun
-    return None, None
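A hypothetical standalone check of the helper above (the pipeline name `en_core_web_sm` and the exact lemmas are assumptions that depend on the installed spaCy model):

    import spacy

    nlp = spacy.load('en_core_web_sm')  # any English pipeline with a parser
    verb, noun = find_root_verb_and_its_dobj_in_string(
        nlp, 'Write a story about a dragon.')
    # with a typical English model: verb == 'write', noun == 'story'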
-
-
-def get_diversity(dataset, top_k_verbs=20, top_k_nouns=4, **kwargs):
-    """
-    Given the lexical tree analysis result, return the diversity results.
-
-    :param dataset: lexical tree analysis result
-    :param top_k_verbs: only keep the top_k_verbs largest verb groups
-    :param top_k_nouns: only keep the top_k_nouns largest noun groups
-        for each verb group
-    :param kwargs: extra args
-    :return: the diversity results
-    """
-    phrases = pd.DataFrame(dataset).dropna()
-    logger.info(f'Found {phrases.shape[0]} valid verb-noun structures '
-                f'out of {dataset.shape[0]} samples.')
-    top_verbs = phrases.groupby(['verb'
-                                 ]).size().nlargest(top_k_verbs).reset_index()
-
-    df = phrases[phrases['verb'].isin(top_verbs['verb'].tolist())]
-    df = df.groupby(['verb', 'noun']).size().reset_index().rename(columns={
-        0: 'count'
-    }).sort_values(by=['count'], ascending=False)
-
-    df = df.groupby('verb').apply(lambda x: x.sort_values(
-        'count', ascending=False).head(top_k_nouns)).reset_index(drop=True)
-    return df
-
-
-
-class DiversityAnalysis:
-    """Apply diversity analysis for each sample and get an overall analysis
-    result."""
-
-    def __init__(self, dataset, output_path, lang_or_model='en'):
-        """
-        Initialization method.
-
-        :param dataset: the dataset to be analyzed
-        :param output_path: path to store the analysis results
-        :param lang_or_model: the diversity model or a specific language
-            used to load the diversity model
-        """
-        self.dataset = dataset
-        self.output_path = output_path
-        if not os.path.exists(self.output_path):
-            os.makedirs(self.output_path)
-        self.lang_or_model = lang_or_model
-
-    def compute(self, lang_or_model=None, column_name='text'):
-        """
-        Apply lexical tree analysis on each sample.
-
-        :param lang_or_model: the diversity model or a specific language
-            used to load the diversity model
-        :param column_name: the name of the column to be analyzed
-        :return: the analysis result.
-        """
-        # load the diversity model
-        lang_or_model = lang_or_model if lang_or_model else self.lang_or_model
-        if isinstance(lang_or_model, str):
-            model_key = prepare_model('spacy', lang=lang_or_model)
-            diversity_model = get_model(model_key)
-        else:
-            diversity_model = lang_or_model
-
-        assert isinstance(diversity_model, spacy.Language)
-
-        def find_verb_noun(sample):
-            try:
-                verb, noun = find_root_verb_and_its_dobj_in_string(
-                    diversity_model, sample[column_name])
-            except Exception as e:
-                logger.warning(str(e))
-                verb, noun = None, None
-            return {'verb': verb, 'noun': noun}
-
-        dataset = self.dataset.map(find_verb_noun)
-        return pd.DataFrame(dataset)
-
-    def analyze(self,
-                lang_or_model=None,
-                column_name='text',
-                postproc_func=get_diversity,
-                **postproc_kwarg):
-        """
-        Apply diversity analysis on the whole dataset.
-
-        :param lang_or_model: the diversity model or a specific language
-            used to load the diversity model
-        :param column_name: the name of the column to be analyzed
-        :param postproc_func: function to analyze diversity. By default,
-            it's the function get_diversity
-        :param postproc_kwarg: arguments of the postproc_func
-        :return:
-        """
-        # get the lexical tree analysis result
-        raw_df = self.compute(lang_or_model=lang_or_model,
-                              column_name=column_name)
-        # get the result of the diversity analysis
-        df = postproc_func(raw_df, **postproc_kwarg)
-
-        # export to result report files
-        df.to_csv(os.path.join(self.output_path, 'diversity.csv'))
-        df.to_markdown(os.path.join(self.output_path, 'diversity.md'))
-
-        return df
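A minimal usage sketch (the dataset variable and output path are illustrative; extra keyword arguments are forwarded to the post-processing function, here get_diversity):

    da = DiversityAnalysis(dataset, './analysis', lang_or_model='en')
    div_df = da.analyze(top_k_verbs=10, top_k_nouns=3)
    print(div_df.head())  # verb/noun groups with their counts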
\ No newline at end of file
diff --git a/_modules/data_juicer/analysis/overall_analysis.html b/_modules/data_juicer/analysis/overall_analysis.html
deleted file mode 100644
index 2b8bef7c6..000000000
--- a/_modules/data_juicer/analysis/overall_analysis.html
+++ /dev/null
@@ -1,204 +0,0 @@
Source code for data_juicer.analysis.overall_analysis

-import os
-from multiprocessing import Pool
-
-import pandas as pd
-from loguru import logger
-from tqdm import tqdm
-
-from data_juicer.utils.constant import Fields
-
-
-def _single_column_analysis(col, *args, **kwargs):
-    col_overall = col.describe(*args, **kwargs)
-    return col_overall
-
-
-
-class OverallAnalysis:
-    """Apply analysis on the overall stats, including mean, std, quantiles,
-    etc."""
-
-    def __init__(self, dataset, output_path):
-        """
-        Initialization method.
-
-        :param dataset: the dataset to be analyzed
-        :param output_path: path to store the analysis results.
-        """
-        self.stats = pd.DataFrame(dataset[Fields.stats])
-        self.output_path = output_path
-        if not os.path.exists(self.output_path):
-            os.makedirs(self.output_path)
-
-        # default percentiles to analyze
-        self.default_percentiles = [0.25, 0.5, 0.75]
-        # supported dtypes of columns to be analyzed
-        # Notice: there won't be mixed types in a column because the stats
-        # are obtained from Dataset, which doesn't allow mixed types.
-        # Notice: for now, stats can only be:
-        # {numbers, string, list of one of the former}
-        self.supported_object_types = {str, list}
-
-    def refine_single_column(self, col):
-        if col.dtype != 'object':
-            # not an object, return directly
-            return col
-        # if the type of this column is object, we can decide the actual
-        # type according to the first element.
-        first = col[0]
-        if type(first) not in self.supported_object_types:
-            logger.warning(f'There is a column of stats with type '
-                           f'[{type(first)}], which is not supported to be '
-                           f'analyzed for now.')
-            return None
-        if type(first) is str:
-            # describe(include='all') can analyze the string type
-            return col
-        elif type(first) is list:
-            # flatten and infer the type
-            col = col.explode().infer_objects()
-            return col
-
-    def analyze(self, percentiles=[], num_proc=1, skip_export=False):
-        """
-        Apply overall analysis on the whole dataset based on the describe
-        method of pandas.
-
-        :param percentiles: percentiles to analyze
-        :param num_proc: number of processes to analyze the dataset
-        :param skip_export: whether to skip exporting the results to disk
-        :return: the overall analysis result.
-        """
-        # merge default and customized percentiles and get overall information
-        percentiles = list(set(percentiles + self.default_percentiles))
-
-        results = []
-        pool = Pool(num_proc)
-        for col_name in self.stats.columns:
-            this_col = self.refine_single_column(self.stats[col_name])
-            res = pool.apply_async(_single_column_analysis,
-                                   kwds={
-                                       'col': this_col,
-                                       'percentiles': percentiles,
-                                       'include': 'all',
-                                   })
-            results.append(res)
-        pool.close()
-        pool.join()
-        result_cols = [res.get() for res in tqdm(results)]
-        overall = pd.DataFrame(result_cols).T
-
-        # export to result report files
-        if not skip_export:
-            overall.to_csv(os.path.join(self.output_path, 'overall.csv'))
-            overall.to_markdown(os.path.join(self.output_path, 'overall.md'))
-
-        return overall
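A minimal usage sketch (paths and worker count are illustrative; the custom percentiles are merged with the defaults [0.25, 0.5, 0.75]):

    oa = OverallAnalysis(dataset, './analysis')
    overall = oa.analyze(percentiles=[0.05, 0.95], num_proc=4)
    print(overall)  # one describe() column per stat, also written to
                    # overall.csv / overall.md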
\ No newline at end of file
diff --git a/_modules/data_juicer/core/adapter.html b/_modules/data_juicer/core/adapter.html
deleted file mode 100644
index e2580fae4..000000000
--- a/_modules/data_juicer/core/adapter.html
+++ /dev/null
@@ -1,257 +0,0 @@
Source code for data_juicer.core.adapter

-from datasets.config import DEFAULT_MAX_BATCH_SIZE
-
-from data_juicer.core.monitor import Monitor
-
-
-
-class Adapter:
-
-    MAX_BATCH_SIZE = 10000
-
-    def __init__(self, cfg: dict):
-        self.cfg = cfg
-        self.idle_resources = Monitor.monitor_current_resources()
-
-    @staticmethod
-    def execute_and_probe(dataset, operators, sample_interval=0.5):
-        """
-        Process the input dataset and probe related information for each OP
-        in the specified operator list.
-
-        For now, we support the following targets to probe:
-        "resource": resource utilization for each OP.
-        "speed": average processing speed for each OP.
-
-        The probe result is a list, and each item in the list is the probe
-        result for one OP.
-        """
-        if operators is None or len(operators) == 0:
-            return []
-
-        # resource utilization list
-        resource_util_list = []
-        # probe for each OP
-        for op in operators:
-            # set num_proc to 1 for each OP to focus on the influence of
-            # batch size only.
-            old_num_proc = op.num_proc
-            op.num_proc = 1
-
-            # number of test samples
-            sample_num = len(dataset)
-            # run a single op and monitor the resource utilization
-            dataset, resource_util_per_op = Monitor.monitor_func(
-                op.run, args=(dataset, ), sample_interval=sample_interval)
-
-            # calculate speed
-            resource_util_per_op[
-                'speed'] = sample_num / resource_util_per_op['time']
-            resource_util_list.append(resource_util_per_op)
-
-            # restore the original num_proc
-            op.num_proc = old_num_proc
-
-        return resource_util_list
-
-    @staticmethod
-    def take_batch(dataset, config):
-        """
-        Split the dataset into batches based on configuration and load factor.
-
-        :param dataset: the dataset to be split
-        :param config: configuration settings, including batch size
-        :return: an iterator of batches
-        """
-        # get the initial batch size
-        batch_size = config.get('batch_size', DEFAULT_MAX_BATCH_SIZE)
-        # should be in [1, 10000]
-        batch_size = min(max(batch_size, 1), Adapter.MAX_BATCH_SIZE)
-
-        # check if there are enough samples
-        num_samples = len(dataset)
-        if batch_size >= num_samples:
-            return dataset
-        else:
-            return dataset.take(batch_size)
-
-    def adapt_workloads(self, dataset, operators):
-        """
-        Manage the scheduling and load balancing for the dataset processing.
-
-        :param dataset: the dataset that needs to be processed
-        :param operators: operators in the data recipe
-        """
-        # TODO: set batch size to 1 for all OPs for probing
-        load_analysis_res, probed_batch_size = self.probe_small_batch(
-            dataset, operators)
-
-        # calculate the batch size for each OP according to the analysis
-        # results
-        bs_per_op = self.batch_size_strategy(load_analysis_res,
-                                             base_bs=probed_batch_size)
-
-        return bs_per_op
-
-    def probe_small_batch(self, dataset, operators):
-        """
-        Perform small-batch pre-execution to probe available resources,
-        current load and estimated OP speed, returning load factors and
-        speed ranks for each OP.
-
-        :param dataset: the dataset to pre-execute a small batch on
-        :param operators: the OP list to be pre-executed and probed
-        :return: a list of probe results for each OP and the length of the
-            data batch used to probe.
-        """
-        # take a small batch
-        data_batch = self.take_batch(dataset, self.cfg)
-        # process it and monitor the resource utilization
-        resource_util_list = self.execute_and_probe(data_batch, operators)
-        # analyze the resource utilization
-        analysis_res = Monitor.analyze_resource_util_list(resource_util_list)
-
-        return analysis_res, len(data_batch)
-
-    def batch_size_strategy(self, load_analysis_res, base_bs=1, util_th=0.9):
-        """
-        Decide the batch size for each op according to its workload analysis
-        result and the expected utilization threshold. We need to guarantee
-        that the resource utilization won't exceed the threshold. Now we only
-        consider the bucket effect, which means the max batch size is decided
-        by the max utilization of all types of resources except GPU util
-        (which is decided by num_proc).
-        """
-        batch_size_per_op = []
-
-        # compute the left utils according to the util_th
-        left_utils = {}
-        for key in self.idle_resources:
-            if 'util.' not in key or 'GPU' in key:
-                continue
-            left_utils[key] = max(0, util_th - self.idle_resources[key])
-
-        for item in load_analysis_res:
-            max_util = 1e-5
-            max_key = min(left_utils.items(), key=lambda it: it[1])[0]
-            analysis_res = item['resource_analysis']
-            for key in analysis_res:
-                if 'util.' not in key or 'GPU' in key:
-                    continue
-                used_util = max(
-                    0, analysis_res[key]['max'] - self.idle_resources[key])
-                if used_util > max_util:
-                    max_util = used_util
-                    max_key = key
-            load_factor = left_utils[max_key] / max_util
-            bs_this_op = min(max(int(base_bs * load_factor), 1),
-                             self.MAX_BATCH_SIZE)
-            batch_size_per_op.append(bs_this_op)
-
-        return batch_size_per_op
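To make the bucket logic concrete, a hand-traced example with made-up numbers (not a measurement):

    idle = 0.2          # idle 'CPU util.' before any OP runs
    util_th = 0.9       # utilization ceiling
    probed_max = 0.55   # max 'CPU util.' observed while probing one OP
    left = util_th - idle        # 0.7 of headroom is still available
    used = probed_max - idle     # the OP itself consumed 0.35
    load_factor = left / used    # 2.0
    print(int(64 * load_factor))  # a probed batch of 64 scales to 128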
\ No newline at end of file
diff --git a/_modules/data_juicer/core/analyzer.html b/_modules/data_juicer/core/analyzer.html
deleted file mode 100644
index 0ec3d9f2c..000000000
--- a/_modules/data_juicer/core/analyzer.html
+++ /dev/null
@@ -1,249 +0,0 @@
Source code for data_juicer.core.analyzer

-import os
-from typing import Optional
-
-from jsonargparse import Namespace
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis
-from data_juicer.config import init_configs
-from data_juicer.format import load_formatter
-from data_juicer.ops import Filter, load_ops
-from data_juicer.utils import cache_utils
-
-from .exporter import Exporter
-
-
-
-class Analyzer:
-    """
-    This Analyzer class is used to analyze a specific dataset.
-
-    It will compute stats for all filter ops in the config file, apply
-    multiple analyses (e.g. OverallAnalysis, ColumnWiseAnalysis, etc.)
-    on these stats, and generate the analysis results (stats tables,
-    distribution figures, etc.) to help users understand the input
-    dataset better.
-    """
-
-    def __init__(self, cfg: Optional[Namespace] = None):
-        """
-        Initialization method.
-
-        :param cfg: optional jsonargparse Namespace.
-        """
-        self.cfg = init_configs() if cfg is None else cfg
-
-        self.work_dir = self.cfg.work_dir
-
-        if self.cfg.use_cache:
-            logger.info(f'Using cache compression method: '
-                        f'[{self.cfg.cache_compress}]')
-            cache_utils.CACHE_COMPRESS = self.cfg.cache_compress
-
-        # set up the formatter
-        logger.info('Setting up data formatter...')
-        self.formatter = load_formatter(
-            dataset_path=self.cfg.dataset_path,
-            generated_dataset_config=self.cfg.generated_dataset_config,
-            text_keys=self.cfg.text_keys,
-            suffixes=self.cfg.suffixes,
-            add_suffix=self.cfg.add_suffix)
-
-        # prepare the exporter and check the export path suffix
-        # NOTICE: the analyzer doesn't need to export dataset texts
-        # (export_ds=False); it only needs to export stats
-        # (export_stats=True).
-        logger.info('Preparing exporter...')
-        self.exporter = Exporter(
-            self.cfg.export_path,
-            self.cfg.export_shard_size,
-            self.cfg.export_in_parallel,
-            self.cfg.np,
-            export_ds=self.cfg.export_original_dataset,
-            keep_stats_in_res_ds=self.cfg.export_original_dataset,
-            export_stats=True)
-
-        # parsed results
-        self.overall_result = None
-        self.overall_single_plot_path = None
-        self.analysis_path = os.path.join(self.cfg.work_dir, 'analysis')
-
-    def run(self,
-            load_data_np: Optional[PositiveInt] = None,
-            skip_export: bool = False,
-            skip_return: bool = False):
-        """
-        Run the dataset analysis pipeline.
-
-        :param load_data_np: number of workers when loading the dataset.
-        :param skip_export: whether to skip exporting the results to disk.
-        :param skip_return: skip the return for API calls.
-        :return: the analyzed dataset.
-        """
-        # 1. format data
-        logger.info('Loading dataset from data formatter...')
-        if load_data_np is None:
-            load_data_np = self.cfg.np
-        dataset = self.formatter.load_dataset(load_data_np, self.cfg)
-
-        # extract processes
-        logger.info('Preparing process operators...')
-        ops = load_ops(self.cfg.process, self.cfg.op_fusion)
-
-        # 2. stats precompute only for filter ops
-        logger.info('Computing the stats of dataset...')
-        stats_collected = False
-        for op in ops:
-            if isinstance(op, Filter):
-                original_process = op.process
-                op.process = None
-                dataset = dataset.process(op, work_dir=self.work_dir)
-                op.process = original_process
-                stats_collected = True
-        if not stats_collected:
-            logger.warning('No stats collected. Please add some Filter ops '
-                           'to the process list in configs.')
-            return dataset
-
-        # 3. data export
-        logger.info('Exporting dataset to disk...')
-        self.exporter.export(dataset)
-        if self.cfg.use_cache and self.cfg.cache_compress:
-            from data_juicer.utils.compress import compress
-            compress(dataset)
-
-        # 4. analysis and output the results to the export path
-        # 4.1. only consider fields in Fields.stats
-        # 4.2. for string fields, only consider their histograms
-        # 4.3. for numeric fields, consider their histograms and box plots
-        # 4.4. otherwise, DO NOT analyze
-
-        logger.info('Applying overall analysis on stats...')
-        overall_analysis = OverallAnalysis(dataset, self.analysis_path)
-        self.overall_result = overall_analysis.analyze(
-            percentiles=self.cfg.percentiles,
-            num_proc=self.cfg.np,
-            skip_export=skip_export)
-
-        logger.info(f'The overall analysis results are: {self.overall_result}')
-
-        logger.info('Applying column-wise analysis on stats...')
-        column_wise_analysis = ColumnWiseAnalysis(
-            dataset,
-            self.analysis_path,
-            overall_result=self.overall_result,
-            save_stats_in_one_file=self.cfg.save_stats_in_one_file,
-        )
-        column_wise_analysis.analyze(skip_export=skip_export)
-
-        if not skip_return:
-            return dataset
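A minimal driving sketch (the recipe path is illustrative; with cfg=None, the constructor falls back to init_configs(), which parses the command line, e.g. `--config configs/demo/analyzer.yaml`):

    analyzer = Analyzer()  # cfg is parsed from the CLI via init_configs()
    analyzer.run()         # stats tables and figures land in
                           # <work_dir>/analysis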
\ No newline at end of file
diff --git a/_modules/data_juicer/core/data.html b/_modules/data_juicer/core/data.html
deleted file mode 100644
index 2a0629f31..000000000
--- a/_modules/data_juicer/core/data.html
+++ /dev/null
@@ -1,555 +0,0 @@
Source code for data_juicer.core.data

-from __future__ import annotations
-
-import copy
-import inspect
-import json
-import os
-import traceback
-from abc import ABC, abstractmethod
-from functools import wraps
-from time import time
-from typing import Union
-
-from datasets import Dataset, DatasetDict, is_caching_enabled
-from datasets.formatting.formatting import LazyBatch
-from loguru import logger
-
-from data_juicer.core.monitor import Monitor
-from data_juicer.ops import UNFORKABLE
-from data_juicer.utils import cache_utils
-from data_juicer.utils.compress import (CompressionOff,
-                                        cleanup_compressed_cache_files,
-                                        compress, decompress)
-from data_juicer.utils.fingerprint_utils import generate_fingerprint
-from data_juicer.utils.process_utils import setup_mp
-
-
-class DJDataset(ABC):
-    """Base dataset of DJ"""
-
-    @abstractmethod
-    def process(
-            self,
-            operators,  # TODO: add type hint
-            *,
-            exporter=None,
-            checkpointer=None,
-            tracer=None) -> DJDataset:
-        """process a list of operators on the dataset."""
-        pass
-
-
-def wrap_func_with_nested_access(f):
-    """
-    Before calling the actual function `f`, wrap its args and kargs into
-    nested ones.
-
-    :param f: function to be wrapped.
-    :return: wrapped function
-    """
-
-    def wrap_nested_structure(*args, **kargs):
-        wrapped_args = [nested_obj_factory(arg) for arg in args]
-        wrapped_kargs = {
-            k: nested_obj_factory(arg)
-            for k, arg in kargs.items()
-        }
-        return wrapped_args, nested_obj_factory(wrapped_kargs)
-
-    @wraps(f)
-    def wrapped_f(*args, **kargs):
-        args, kargs = wrap_nested_structure(*args, **kargs)
-        # ensure the args passed to the final call of f can be nested, in
-        # case deeper-order wrapper funcs de-wrap this nesting behavior
-        args = [
-            wrap_func_with_nested_access(arg) if callable(arg) else arg
-            for arg in args
-        ]
-        kargs = {
-            k: (wrap_func_with_nested_access(arg) if callable(arg) else arg)
-            for (k, arg) in kargs.items()
-        }
-        return f(*args, **kargs)
-
-    return wrapped_f
-
-
-def nested_obj_factory(obj):
-    """
-    Use nested classes to wrap the input object.
-
-    :param obj: object to be nested.
-    :return: nested object
-    """
-    if isinstance(obj, Dataset):
-        return NestedDataset(obj)
-    elif isinstance(obj, DatasetDict):
-        return NestedDatasetDict(obj)
-    elif isinstance(obj, dict):
-        return NestedQueryDict(obj)
-    elif isinstance(obj, LazyBatch):
-        obj.data = NestedQueryDict(obj.data)
-        return obj
-    elif isinstance(obj, list):
-        return [nested_obj_factory(item) for item in obj]
-    else:
-        return obj
-
-
-class NestedQueryDict(dict):
-    """Enhanced dict for better usability."""
-
-    def __init__(self, *args, **kargs):
-        if len(args) == 1 and isinstance(args[0], Dataset):
-            # init from a Dataset instance
-            self.__dict__ = copy.copy(args[0].__dict__)
-        else:
-            # init from scratch
-            super().__init__(*args, **kargs)
-
-        # batched sample: keys and values are organized as lists
-        for k, v in self.items():
-            if isinstance(v, list) and len(v) > 0 and isinstance(v[0], dict):
-                self[k] = [NestedQueryDict(item) for item in v]
-
-    def __getitem__(self, key):
-        return nested_query(self, key)
-
-
-class NestedDatasetDict(DatasetDict):
-    """Enhanced HuggingFace-DatasetDict for better usability and efficiency."""
-
-    def __init__(self, *args, **kargs):
-        if len(args) == 1 and isinstance(args[0], Dataset):
-            # init from a Dataset instance
-            self.__dict__ = copy.copy(args[0].__dict__)
-        else:
-            # init from scratch
-            super().__init__(*args, **kargs)
-
-    def __getitem__(self, key):
-        return nested_query(self, key)
-
-    def map(self, **args):
-        """Override the map func, which is called by most common operations,
-        such that the processed samples can be accessed by nested manner."""
-        if 'function' not in args or args['function'] is None:
-            args['function'] = lambda x: nested_obj_factory(x)
-        else:
-            args['function'] = wrap_func_with_nested_access(args['function'])
-
-        return super().map(**args)
-
-
-
-class NestedDataset(Dataset, DJDataset):
-    """Enhanced HuggingFace-Dataset for better usability and efficiency."""
-
-    def __init__(self, *args, **kargs):
-        if len(args) == 1 and isinstance(args[0], Dataset):
-            # init from another Dataset instance
-            self.__dict__ = copy.copy(args[0].__dict__)
-        else:
-            # init from scratch
-            super().__init__(*args, **kargs)
-
-        self.need_to_cleanup_caches = not is_caching_enabled()
-
-    def __getitem__(self, key):
-        if isinstance(key, str):
-            # to index columns by query as string name(s)
-            res = nested_query(self, key)
-        else:
-            # to index rows by query as integer index, slices,
-            # or iter of indices or bools
-            res = super().__getitem__(key)
-        return nested_obj_factory(res)
-
-    def process(self,
-                operators,
-                *,
-                work_dir=None,
-                exporter=None,
-                checkpointer=None,
-                tracer=None):
-        if operators is None:
-            return self
-
-        if not isinstance(operators, list):
-            operators = [operators]
-        unforkable_operators = set(UNFORKABLE.modules.keys())
-
-        # resource utilization monitor
-        resource_util_list = []
-
-        dataset = self
-        try:
-            for op in operators:
-                mp_context = ['forkserver', 'spawn'] if (
-                    op.use_cuda()
-                    or op._name in unforkable_operators) else None
-                setup_mp(mp_context)
-
-                start = time()
-                # run a single op
-                run_args = {
-                    'dataset': dataset,
-                    'exporter': exporter,
-                    'tracer': tracer,
-                }
-                dataset, resource_util_per_op = Monitor.monitor_func(
-                    op.run, args=run_args)
-                # record processed ops
-                if checkpointer is not None:
-                    checkpointer.record(op._op_cfg)
-                resource_util_list.append(resource_util_per_op)
-                end = time()
-                logger.info(f'OP [{op._name}] Done in {end - start:.3f}s. '
-                            f'Left {len(dataset)} samples.')
-        except:  # noqa: E722
-            logger.error(f'An error occurred during Op [{op._name}].')
-            traceback.print_exc()
-            exit(1)
-        finally:
-            if checkpointer and dataset is not self:
-                logger.info('Writing checkpoint of dataset processed by '
-                            'last op...')
-                dataset.cleanup_cache_files()
-                checkpointer.save_ckpt(dataset)
-            if work_dir:
-                with open(os.path.join(work_dir, 'monitor.json'), 'w') as out:
-                    json.dump(resource_util_list, out)
-        return dataset
-
-    def map(self, *args, **kargs):
-        """Override the map func, which is called by most common operations,
-        such that the processed samples can be accessed in a nested
-        manner."""
-        if args:
-            args = list(args)
-            # the first positional parameter is the function
-            if args[0] is None:
-                args[0] = lambda x: nested_obj_factory(x)
-            else:
-                args[0] = wrap_func_with_nested_access(args[0])
-            called_func = args[0]
-        else:
-            if 'function' not in kargs or kargs['function'] is None:
-                kargs['function'] = lambda x: nested_obj_factory(x)
-            else:
-                kargs['function'] = wrap_func_with_nested_access(
-                    kargs['function'])
-            called_func = kargs['function']
-
-        # for a wrapped function, try to get its unwrapped (bound) method
-        while not inspect.ismethod(called_func) and hasattr(
-                called_func, '__wrapped__'):
-            called_func = called_func.__wrapped__
-
-        if inspect.ismethod(called_func):
-            # batched is required for fault-tolerant or batched OPs
-            if callable(getattr(
-                    called_func.__self__,
-                    'is_batched_op')) and called_func.__self__.is_batched_op(
-                    ) or not getattr(called_func.__self__, 'turbo', False):
-                kargs['batched'] = True
-                kargs['batch_size'] = kargs.pop('batch_size', 1) if hasattr(
-                    called_func.__self__, 'is_batched_op'
-                ) and called_func.__self__.is_batched_op() else 1
-            else:
-                kargs['batched'] = False
-
-            # rank is required for cuda model loading
-            if callable(
-                    getattr(called_func.__self__,
-                            'use_cuda')) and called_func.__self__.use_cuda():
-                kargs['with_rank'] = True
-
-        if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
-            new_fingerprint = generate_fingerprint(self, *args, **kargs)
-            kargs['new_fingerprint'] = new_fingerprint
-
-        if cache_utils.CACHE_COMPRESS:
-            decompress(self, kargs['new_fingerprint'],
-                       kargs['num_proc'] if 'num_proc' in kargs else 1)
-
-        new_ds = NestedDataset(super().map(*args, **kargs))
-
-        if cache_utils.CACHE_COMPRESS:
-            compress(self, new_ds,
-                     kargs['num_proc'] if 'num_proc' in kargs else 1)
-
-        if self.need_to_cleanup_caches:
-            new_ds.cleanup_cache_files()
-
-        return new_ds
-
-    def filter(self, *args, **kargs):
-        """Override the filter func, which is called by most common
-        operations, such that the processed samples can be accessed in a
-        nested manner."""
-        if args:
-            args = list(args)
-            # the first positional parameter is the function
-            if args[0] is None:
-                args[0] = lambda x: nested_obj_factory(x)
-            else:
-                args[0] = wrap_func_with_nested_access(args[0])
-            called_func = args[0]
-        else:
-            if 'function' not in kargs or kargs['function'] is None:
-                kargs['function'] = lambda x: nested_obj_factory(x)
-            else:
-                kargs['function'] = wrap_func_with_nested_access(
-                    kargs['function'])
-            called_func = kargs['function']
-
-        # for a wrapped function, try to get its unwrapped (bound) method
-        while not inspect.ismethod(called_func) and hasattr(
-                called_func, '__wrapped__'):
-            called_func = called_func.__wrapped__
-
-        # batched is always required for fault tolerance
-        if inspect.ismethod(called_func):
-            if callable(getattr(
-                    called_func.__self__,
-                    'is_batched_op')) and called_func.__self__.is_batched_op():
-                kargs['batched'] = True
-                kargs['batch_size'] = kargs.pop('batch_size', 1)
-
-        if 'new_fingerprint' not in kargs or kargs['new_fingerprint'] is None:
-            new_fingerprint = generate_fingerprint(self, *args, **kargs)
-            kargs['new_fingerprint'] = new_fingerprint
-
-        # For filter, it involves a map and a filter operation, so the final
-        # cache files include two sets with different fingerprints (before
-        # and after). So we need to decompress these two sets of compressed
-        # cache files.
-        if cache_utils.CACHE_COMPRESS:
-            decompress(self, [kargs['new_fingerprint'], self._fingerprint],
-                       kargs['num_proc'] if 'num_proc' in kargs else 1)
-
-        # Turn off compression because filter actually invokes map. For
-        # cache file changes, map: A -> B, filter: A -> A, B. If we compress
-        # the caches of map, ops after filter cannot find the cache files A.
-        # So we turn off the inner cache compression for filter.
-        # Same for cleaning up cache files.
-        with CompressionOff():
-            prev_state = self.need_to_cleanup_caches
-            self.need_to_cleanup_caches = False
-            new_ds = NestedDataset(super().filter(*args, **kargs))
-            self.need_to_cleanup_caches = prev_state
-
-        if cache_utils.CACHE_COMPRESS:
-            compress(self, new_ds,
-                     kargs['num_proc'] if 'num_proc' in kargs else 1)
-
-        if self.need_to_cleanup_caches:
-            new_ds.cleanup_cache_files()
-
-        return new_ds
-
-    def select(self, *args, **kargs):
-        """Override the select func, such that selected samples can be
-        accessed in a nested manner."""
-        return nested_obj_factory(super().select(*args, **kargs))
-
-    @classmethod
-    def from_dict(cls, *args, **kargs):
-        """Override the from_dict func, which is called by most from_xx
-        constructors, such that the constructed dataset object is a
-        NestedDataset."""
-        return NestedDataset(super().from_dict(*args, **kargs))
-
-    def add_column(self, *args, **kargs):
-        """Override the add_column func, such that the processed samples
-        can be accessed in a nested manner."""
-        return NestedDataset(super().add_column(*args, **kargs))
-
-    def select_columns(self, *args, **kargs):
-        """Override the select_columns func, such that the processed samples
-        can be accessed in a nested manner."""
-        return NestedDataset(super().select_columns(*args, **kargs))
-
-    def remove_columns(self, *args, **kargs):
-        """Override the remove_columns func, such that the processed samples
-        can be accessed in a nested manner."""
-        return NestedDataset(super().remove_columns(*args, **kargs))
-
-    def cleanup_cache_files(self):
-        """Override the cleanup_cache_files func, clearing raw and
-        compressed cache files."""
-        cleanup_compressed_cache_files(self)
-        return super().cleanup_cache_files()
-
-    @staticmethod
-    def load_from_disk(*args, **kargs):
-        return NestedDataset(Dataset.load_from_disk(*args, **kargs))
-
-
-def nested_query(root_obj: Union[NestedDatasetDict, NestedDataset,
-                                 NestedQueryDict], key):
-    """
-    Find an item in a given object by first checking the flattened layer,
-    then checking nested layers.
-
-    :param root_obj: the object
-    :param key: the stored item to be queried, e.g., "meta" or
-        "meta.date"
-    :return:
-    """
-    subkeys = key.split('.')
-
-    tmp = root_obj
-    for i in range(len(subkeys)):
-        try:
-            key_to_query = '.'.join(subkeys[i:len(subkeys)])
-            if isinstance(tmp,
-                          (NestedQueryDict, NestedDataset, NestedDatasetDict)):
-                # access the field using the base class's func to avoid an
-                # endless loop
-                res = super(type(tmp), tmp).__getitem__(key_to_query)
-            elif isinstance(tmp, list):
-                # NestedDataset may return multiple rows as a list
-                res = [nested_query(item, key_to_query) for item in tmp]
-            else:
-                # NestedQueryDict may return a single row
-                res = tmp[key_to_query]
-            if res is not None:
-                return res
-        except Exception as outer_get_error:
-            exist_in_dict = issubclass(type(tmp), dict) and \
-                '.'.join(subkeys[i:i + 1]) in tmp
-            exist_in_dataset = issubclass(type(tmp), Dataset) and '.'.join(
-                subkeys[i:i + 1]) in tmp.features
-            if exist_in_dict or exist_in_dataset:
-                # dive into the next level
-                tmp = nested_obj_factory(tmp['.'.join(subkeys[i:i + 1])])
-            else:
-                logger.debug(
-                    f'cannot find item given key={key} in dataset='
-                    f'{root_obj}. For the final caught outer-exception, '
-                    f'type is: {type(outer_get_error)}, '
-                    f'info is: {outer_get_error}')
-                return None
-
-    return None
-
-
-def add_same_content_to_new_column(sample,
-                                   new_column_name,
-                                   initial_value=None):
-    """
-    A helper function to speed up the add_column function. Apply map on this
-    function in parallel instead of using add_column.
-
-    :param sample: a single sample to add this new column/field to.
-    :param new_column_name: the name of this new column/field.
-    :param initial_value: the initial value of this new column/field.
-    """
-    sample[new_column_name] = initial_value
-    return sample
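A small sketch of the nested access these wrappers enable (field names are illustrative):

    ds = NestedDataset.from_dict({'meta': [{'date': '2024-01-01'}]})
    print(ds['meta.date'])  # dotted keys reach into nested fields
    # -> ['2024-01-01']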
\ No newline at end of file
diff --git a/_modules/data_juicer/core/executor.html b/_modules/data_juicer/core/executor.html
deleted file mode 100644
index 11f1f11fd..000000000
--- a/_modules/data_juicer/core/executor.html
+++ /dev/null
@@ -1,296 +0,0 @@
Source code for data_juicer.core.executor

-import os
-from time import time
-from typing import Optional
-
-from jsonargparse import Namespace
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.config import init_configs
-from data_juicer.core.data import Dataset
-from data_juicer.format.load import load_formatter
-from data_juicer.format.mixture_formatter import MixtureFormatter
-from data_juicer.ops import OPERATORS, load_ops
-from data_juicer.utils import cache_utils
-from data_juicer.utils.ckpt_utils import CheckpointManager
-
-from ..ops.selector.frequency_specified_field_selector import \
-    FrequencySpecifiedFieldSelector
-from ..ops.selector.topk_specified_field_selector import \
-    TopkSpecifiedFieldSelector
-from .exporter import Exporter
-from .tracer import Tracer
-
-
-
-class Executor:
-    """
-    This Executor class is used to process a specific dataset.
-
-    It will load the dataset and unify its format, then apply all the
-    ops in the config file in order and generate a processed dataset.
-    """
-
-    def __init__(self, cfg: Optional[Namespace] = None):
-        """
-        Initialization method.
-
-        :param cfg: optional jsonargparse Namespace.
-        """
-        self.cfg = init_configs() if cfg is None else cfg
-
-        self.work_dir = self.cfg.work_dir
-
-        self.tracer = None
-        self.ckpt_manager = None
-
-        # only enable cache compression when using cache
-        if self.cfg.use_cache:
-            logger.info(f'Using cache compression method: '
-                        f'[{self.cfg.cache_compress}]')
-            cache_utils.CACHE_COMPRESS = self.cfg.cache_compress
-
-        # set up the formatter
-        logger.info('Setting up data formatter...')
-        self.formatter = load_formatter(
-            dataset_path=self.cfg.dataset_path,
-            generated_dataset_config=self.cfg.generated_dataset_config,
-            text_keys=self.cfg.text_keys,
-            suffixes=self.cfg.suffixes,
-            add_suffix=self.cfg.add_suffix)
-
-        # whether to use the checkpoint mechanism. If it's true, Executor
-        # will check if there are existing checkpoints first and try to
-        # load them. If the checkpoints are loaded successfully, ops that
-        # have already been processed will be skipped.
-        if self.cfg.use_checkpoint:
-            logger.info('Preparing checkpoint manager...')
-            self.ckpt_dir = os.path.join(self.work_dir, 'ckpt')
-            self.ckpt_manager = CheckpointManager(self.ckpt_dir,
-                                                  self.cfg.process,
-                                                  self.cfg.np)
-            if self.ckpt_manager.ckpt_available:
-                logger.info('Found an existing dataset checkpoint.')
-                self.cfg.process = self.ckpt_manager.get_left_process_list()
-
-        # prepare the exporter and check the export path suffix
-        logger.info('Preparing exporter...')
-        self.exporter = Exporter(
-            self.cfg.export_path,
-            self.cfg.export_shard_size,
-            self.cfg.export_in_parallel,
-            self.cfg.np,
-            keep_stats_in_res_ds=self.cfg.keep_stats_in_res_ds,
-            keep_hashes_in_res_ds=self.cfg.keep_hashes_in_res_ds)
-
-        # set up the tracer
-        self.open_tracer = self.cfg.open_tracer
-        if self.open_tracer:
-            logger.info('Preparing tracer...')
-            self.tracer = Tracer(self.work_dir, show_num=self.cfg.trace_num)
-            self.op_list_to_trace = self.cfg.op_list_to_trace
-            if len(self.cfg.op_list_to_trace) == 0:
-                logger.info('Trace for all ops.')
-                self.op_list_to_trace = set(OPERATORS.modules.keys())
-
-    def sample_data(self,
-                    dataset_to_sample: Dataset = None,
-                    load_data_np=None,
-                    sample_ratio: float = 1.0,
-                    sample_algo: str = 'uniform',
-                    **kwargs):
-        """
-        Sample a subset from the given dataset.
-
-        :param dataset_to_sample: dataset to sample from. If None, use the
-            formatter linked by the executor. Default is None.
-        :param load_data_np: number of workers when loading the dataset.
-        :param sample_ratio: the ratio of the sample size to the original
-            dataset size. Default is 1.0 (no sampling).
-        :param sample_algo: sampling algorithm to use. Options are
-            "uniform", "frequency_specified_field_selector", or
-            "topk_specified_field_selector". Default is "uniform".
-        :return: a sampled Dataset.
-        """
-        # determine the dataset to sample from
-        if dataset_to_sample is not None:
-            dataset = dataset_to_sample
-        elif self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available:
-            logger.info('Loading dataset from checkpoint...')
-            dataset = self.ckpt_manager.load_ckpt()
-        elif hasattr(self, 'formatter'):
-            logger.info('Loading dataset from data formatter...')
-            if load_data_np is None:
-                load_data_np = self.cfg.np
-            dataset = self.formatter.load_dataset(load_data_np, self.cfg)
-        else:
-            raise ValueError('No dataset available to sample from.')
-
-        # perform sampling based on the specified algorithm
-        if sample_algo == 'uniform':
-            return MixtureFormatter.random_sample(dataset, sample_ratio)
-        elif sample_algo == 'frequency_specified_field_selector':
-            dj_op = FrequencySpecifiedFieldSelector(**kwargs)
-            return dj_op.process(dataset)
-        elif sample_algo == 'topk_specified_field_selector':
-            dj_op = TopkSpecifiedFieldSelector(**kwargs)
-            return dj_op.process(dataset)
-        else:
-            raise ValueError(f'Unsupported sample_algo: {sample_algo}')
-
-    def run(self,
-            load_data_np: Optional[PositiveInt] = None,
-            skip_return=False):
-        """
-        Run the dataset process pipeline.
-
-        :param load_data_np: number of workers when loading the dataset.
-        :param skip_return: skip the return for API calls.
-        :return: the processed dataset.
-        """
-        # 1. format data
-        if self.cfg.use_checkpoint and self.ckpt_manager.ckpt_available:
-            logger.info('Loading dataset from checkpoint...')
-            dataset = self.ckpt_manager.load_ckpt()
-        else:
-            logger.info('Loading dataset from data formatter...')
-            if load_data_np is None:
-                load_data_np = self.cfg.np
-            dataset = self.formatter.load_dataset(load_data_np, self.cfg)
-
-        # 2. extract processes
-        logger.info('Preparing process operators...')
-        ops = load_ops(self.cfg.process, self.cfg.op_fusion)
-
-        # 3. data process
-        # - if the tracer is open, trace each op after it's processed
-        # - if checkpointing is open, clean the cache files after each process
-        logger.info('Processing data...')
-        tstart = time()
-        dataset = dataset.process(ops,
-                                  work_dir=self.work_dir,
-                                  exporter=self.exporter,
-                                  checkpointer=self.ckpt_manager,
-                                  tracer=self.tracer)
-        tend = time()
-        logger.info(f'All OPs are done in {tend - tstart:.3f}s.')
-
-        # 4. data export
-        logger.info('Exporting dataset to disk...')
-        self.exporter.export(dataset)
-        # compress the last dataset after exporting
-        if self.cfg.use_cache and self.cfg.cache_compress:
-            from data_juicer.utils.compress import compress
-            compress(dataset)
-
-        if not skip_return:
-            return dataset
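A minimal driving sketch (mirrors the Analyzer above; the recipe is passed on the command line, e.g. `--config configs/demo/process.yaml`):

    executor = Executor()       # cfg is parsed from the CLI via init_configs()
    processed = executor.run()  # the result is also exported to
                                # cfg.export_path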
\ No newline at end of file
diff --git a/_modules/data_juicer/core/exporter.html b/_modules/data_juicer/core/exporter.html
deleted file mode 100644
index e05cd3547..000000000
--- a/_modules/data_juicer/core/exporter.html
+++ /dev/null
@@ -1,375 +0,0 @@
Source code for data_juicer.core.exporter

-import os
-from multiprocessing import Pool
-
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, HashKeys
-
-
-
[docs]class Exporter: - """The Exporter class is used to export a dataset to files of specific - format.""" - - KiB = 2**10 # 1024 - MiB = 2**20 # 1024*1024 - GiB = 2**30 # 1024*1024*1024 - TiB = 2**40 # 1024*1024*1024*1024 - -
[docs] def __init__(self, - export_path, - export_shard_size=0, - export_in_parallel=True, - num_proc=1, - export_ds=True, - keep_stats_in_res_ds=False, - keep_hashes_in_res_ds=False, - export_stats=True): - """ - Initialization method. - - :param export_path: the path to export datasets. - :param export_shard_size: the size of each shard of exported - dataset. In default, it's 0, which means export the dataset - to a single file. - :param num_proc: number of process to export the dataset. - :param export_ds: whether to export the dataset contents. - :param keep_stats_in_res_ds: whether to keep stats in the result - dataset. - :param keep_hashes_in_res_ds: whether to keep hashes in the result - dataset. - :param export_stats: whether to export the stats of dataset. - """ - self.export_path = export_path - self.export_shard_size = export_shard_size - self.export_in_parallel = export_in_parallel - self.export_ds = export_ds - self.keep_stats_in_res_ds = keep_stats_in_res_ds - self.keep_hashes_in_res_ds = keep_hashes_in_res_ds - self.export_stats = export_stats - self.suffix = self._get_suffix(export_path) - self.num_proc = num_proc - self.max_shard_size_str = '' - - # get the string format of shard size - if self.export_shard_size // Exporter.TiB: - self.max_shard_size_str = '%.2f TiB' % (self.export_shard_size / - Exporter.TiB) - elif self.export_shard_size // Exporter.GiB: - self.max_shard_size_str = '%.2f GiB' % (self.export_shard_size / - Exporter.GiB) - elif self.export_shard_size // Exporter.MiB: - self.max_shard_size_str = '%.2f MiB' % (self.export_shard_size / - Exporter.MiB) - elif self.export_shard_size // Exporter.KiB: - self.max_shard_size_str = '%.2f KiB' % (self.export_shard_size / - Exporter.KiB) - else: - self.max_shard_size_str = '%.2f Bytes' % (self.export_shard_size) - - # we recommend users to set a shard size between MiB and TiB. - if 0 < self.export_shard_size < Exporter.MiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is less than 1MiB. If the result dataset is too ' - f'large, there might be too many shard files to ' - f'generate.') - if self.export_shard_size >= Exporter.TiB: - logger.warning(f'The export_shard_size [{self.max_shard_size_str}]' - f' is larger than 1TiB. It might generate large ' - f'single shard file and make loading and exporting ' - f'slower.')
- - def _get_suffix(self, export_path): - """ - Get the suffix of export path and check if it's supported. - - We only support ["jsonl", "json", "parquet"] for now. - - :param export_path: the path to export datasets. - :return: the suffix of export_path. - """ - suffix = export_path.split('.')[-1].lower() - support_dict = self._router() - if suffix not in support_dict: - raise NotImplementedError(f'Suffix of export path [' - f'{export_path}] is not supported ' - f'for now. Only support ' - f'{list(support_dict.keys())}.') - return suffix - - def _export_impl(self, dataset, export_path, suffix, export_stats=True): - """ - Export a dataset to specific path. - - :param dataset: the dataset to export. - :param export_path: the path to export the dataset. - :param suffix: suffix of export path. - :param export_stats: whether to export stats of dataset. - :return: - """ - if Fields.stats in dataset.features and export_stats: - # export stats of datasets into a single file. - logger.info('Exporting computed stats into a single file...') - ds_stats = dataset.select_columns(Fields.stats) - stats_file = export_path.replace('.' + suffix, '_stats.jsonl') - Exporter.to_jsonl( - ds_stats, - stats_file, - num_proc=self.num_proc if self.export_in_parallel else 1) - - if self.export_ds: - # fetch the corresponding export method according to the suffix - if not self.keep_stats_in_res_ds: - extra_fields = {Fields.stats} - feature_fields = set(dataset.features.keys()) - removed_fields = extra_fields.intersection(feature_fields) - dataset = dataset.remove_columns(removed_fields) - if not self.keep_hashes_in_res_ds: - extra_fields = { - HashKeys.hash, - HashKeys.minhash, - HashKeys.simhash, - HashKeys.imagehash, - HashKeys.videohash, - } - feature_fields = set(dataset.features.keys()) - removed_fields = extra_fields.intersection(feature_fields) - dataset = dataset.remove_columns(removed_fields) - export_method = Exporter._router()[suffix] - if self.export_shard_size <= 0: - # export the whole dataset into one single file. - logger.info('Export dataset into a single file...') - export_method( - dataset, - export_path, - num_proc=self.num_proc if self.export_in_parallel else 1) - else: - # compute the dataset size and number of shards to split - if dataset._indices is not None: - dataset_nbytes = dataset.data.nbytes * len( - dataset._indices) / len(dataset.data) - else: - dataset_nbytes = dataset.data.nbytes - num_shards = int(dataset_nbytes / self.export_shard_size) + 1 - num_shards = min(num_shards, len(dataset)) - - # split the dataset into multiple shards - logger.info(f'Split the dataset to export into {num_shards} ' - f'shards. 
Size of each shard <= ' - f'{self.max_shard_size_str}') - shards = [ - dataset.shard(num_shards=num_shards, - index=i, - contiguous=True) for i in range(num_shards) - ] - len_num = len(str(num_shards)) + 1 - num_fmt = f'%0{len_num}d' - - # regard the export path as a directory and set file names for - # each shard - dirname = os.path.dirname(os.path.abspath(self.export_path)) - basename = os.path.basename(self.export_path).split('.')[0] - os.makedirs(dirname, exist_ok=True) - filenames = [ - os.path.join( - dirname, f'{basename}-{num_fmt % index}-of-' - f'{num_fmt % num_shards}' - f'.{self.suffix}') for index in range(num_shards) - ] - - # export dataset into multiple shards using multiprocessing - logger.info(f'Start to exporting to {num_shards} shards.') - pool = Pool(self.num_proc) - for i in range(num_shards): - pool.apply_async(export_method, - args=( - shards[i], - filenames[i], - )) - pool.close() - pool.join() - -
[docs] def export(self, dataset): - """ - Export method for a dataset. - - :param dataset: the dataset to export. - :return: - """ - self._export_impl(dataset, self.export_path, self.suffix, - self.export_stats)
- -
[docs]    def export_compute_stats(self, dataset, export_path):
-        """
-        Export method for saving computed stats in filters.
-        """
-        keep_stats_in_res_ds = self.keep_stats_in_res_ds
-        self.keep_stats_in_res_ds = True
-        self._export_impl(dataset,
-                          export_path,
-                          self.suffix,
-                          export_stats=False)
-        self.keep_stats_in_res_ds = keep_stats_in_res_ds
- -
[docs] @staticmethod - def to_jsonl(dataset, export_path, num_proc=1, **kwargs): - """ - Export method for jsonl target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param num_proc: the number of processes used to export the dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_json(export_path, force_ascii=False, num_proc=num_proc)
- -
[docs] @staticmethod - def to_json(dataset, export_path, num_proc=1, **kwargs): - """ - Export method for json target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param num_proc: the number of processes used to export the dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_json(export_path, - force_ascii=False, - num_proc=num_proc, - lines=False)
- -
[docs] @staticmethod - def to_parquet(dataset, export_path, **kwargs): - """ - Export method for parquet target files. - - :param dataset: the dataset to export. - :param export_path: the path to store the exported dataset. - :param kwargs: extra arguments. - :return: - """ - dataset.to_parquet(export_path)
- - # suffix to export method - @staticmethod - def _router(): - """ - A router from different suffixes to corresponding export methods. - - :return: A dict router. - """ - return { - 'jsonl': Exporter.to_jsonl, - 'json': Exporter.to_json, - 'parquet': Exporter.to_parquet, - }
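A usage sketch of the router above (the dataset `ds` and the output path are hypothetical): an export call reduces to a dict lookup on the suffix, so supporting a new format only needs a new static method plus a new router entry.

```python
# Assuming `ds` is a HuggingFace datasets.Dataset already in memory.
export_method = Exporter._router()['jsonl']   # -> Exporter.to_jsonl
export_method(ds, '/path/to/result.jsonl', num_proc=4)
```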
diff --git a/_modules/data_juicer/core/monitor.html b/_modules/data_juicer/core/monitor.html
deleted file mode 100644
index 9a206c04d..000000000
--- a/_modules/data_juicer/core/monitor.html
+++ /dev/null
@@ -1,324 +0,0 @@

Source code for data_juicer.core.monitor
-import time
-from functools import partial
-from multiprocessing import get_context
-
-from data_juicer.utils.resource_utils import (get_cpu_count,
-                                              get_cpu_utilization,
-                                              query_cuda_info, query_mem_info)
-
-
-def resource_monitor(mdict, interval):
-    # function to monitor the resource
-    # interval is the sampling interval
-    this_states = []
-    while True:
-        this_states.append(Monitor.monitor_current_resources())
-        time.sleep(interval)
-        if mdict['stop']:
-            break
-    mdict['resource'] = this_states
-
-
-
[docs]class Monitor: - """ - Monitor resource utilization and other information during the data - processing. - - Resource utilization dict: (for each func) - '''python - { - 'time': 10, - 'resource': [ - { - 'timestamp': xxx, - 'CPU count': xxx, - 'GPU free mem.': xxx. - ... - }, - { - 'timestamp': xxx, - 'CPU count': xxx, - 'GPU free mem.': xxx, - ... - }, - ] - } - ''' - - Based on the structure above, the resource utilization analysis result will - add several extra fields on the first level: - '''python - { - 'time': 10, - 'resource': [...], - 'resource_analysis': { - 'GPU free mem.': { - 'max': xxx, - 'min': xxx, - 'avg': xxx, - }, - ... - } - } - ''' - Only those fields in DYNAMIC_FIELDS will be analyzed. - """ - - DYNAMIC_FIELDS = { - 'CPU util.', - 'Used mem.', - 'Free mem.', - 'Available mem.', - 'Mem. util.', - 'GPU free mem.', - 'GPU used mem.', - 'GPU util.', - } - -
[docs] def __init__(self): - pass
- -
[docs] def monitor_all_resources(self): - """ - Detect the resource utilization of all distributed nodes. - """ - # TODO - raise NotImplementedError
- -
[docs]    @staticmethod
-    def monitor_current_resources():
-        """
-        Detect the resource utilization of the current environment/machine.
-        All "util." data are ratios in the range of [0.0, 1.0], and all
-        "mem." data are in MB.
-        """
-        resource_dict = dict()
-        # current time
-        resource_dict['timestamp'] = time.time()
-
-        # CPU
-        resource_dict['CPU count'] = get_cpu_count()
-        resource_dict['CPU util.'] = get_cpu_utilization() / 100.0
-        resource_dict['Total mem.'] = query_mem_info('total')
-        resource_dict['Used mem.'] = query_mem_info('used')
-        resource_dict['Free mem.'] = query_mem_info('free')
-        resource_dict['Available mem.'] = query_mem_info('available')
-        resource_dict['Mem. util.'] = resource_dict[
-            'Used mem.'] / resource_dict['Total mem.']
-
-        # GPU
-        resource_dict['GPU total mem.'] = query_cuda_info('memory.total')
-        resource_dict['GPU free mem.'] = query_cuda_info('memory.free')
-        resource_dict['GPU used mem.'] = query_cuda_info('memory.used')
-        resource_dict['GPU util.'] = query_cuda_info('utilization.gpu')
-        if resource_dict['GPU util.']:
-            resource_dict['GPU util.'] = [
-                x / 100.0 for x in resource_dict['GPU util.']
-            ]
-
-        return resource_dict
- -
[docs] @staticmethod - def analyze_resource_util_list(resource_util_list): - """ - Analyze the resource utilization for a given resource util list. - Compute {'max', 'min', 'avg'} of resource metrics for each dict item. - """ - res_list = [] - for item in resource_util_list: - res_list.append(Monitor.analyze_single_resource_util(item)) - return res_list
- -
[docs]    @staticmethod
-    def analyze_single_resource_util(resource_util_dict):
-        """
-        Analyze the resource utilization for a single resource util dict.
-        Compute {'max', 'min', 'avg'} of each resource metric.
-        """
-        analysis_res = {}
-        record_list = {}
-        for record in resource_util_dict['resource']:
-            for key in Monitor.DYNAMIC_FIELDS:
-                if key in record:
-                    if record[key] is None:
-                        continue
-                    elif isinstance(record[key], list):
-                        record_list.setdefault(key, []).extend(record[key])
-                    else:
-                        record_list.setdefault(key, []).append(record[key])
-
-        # analyze the max, min, and avg
-        for key in record_list:
-            analysis_res[key] = {
-                'max': max(record_list[key]),
-                'min': min(record_list[key]),
-                'avg': sum(record_list[key]) / len(record_list[key]),
-            }
-        resource_util_dict['resource_analysis'] = analysis_res
-
-        return resource_util_dict
- -
[docs]    @staticmethod
-    def monitor_func(func, args=None, sample_interval=0.5):
-        """
-        Run the given function and monitor its resource utilization
-        during execution.
-
-        A background process samples the current resource utilization
-        every `sample_interval` seconds. The return value is a tuple of
-        the function's own return value and a resource utilization dict
-        with the structure described in the class docstring.
-        """
-        if args is None:
-            args = {}
-        if isinstance(args, dict):
-            func = partial(func, **args)
-        elif isinstance(args, list) or isinstance(args, tuple):
-            func = partial(func, *args)
-        else:
-            func = partial(func, args)
-
-        # resource utilization dict
-        resource_util_dict = {}
-
-        # start monitor
-        ctx = get_context('fork')
-        with ctx.Manager() as manager:
-            mdict = manager.dict()
-            mdict['stop'] = False
-            monitor_proc = ctx.Process(target=resource_monitor,
-                                       args=(
-                                           mdict,
-                                           sample_interval,
-                                       ))
-            monitor_proc.start()
-            # start timer
-            start = time.time()
-
-            # run single op
-            ret = func()
-
-            # end timer
-            end = time.time()
-
-            # stop monitor
-            mdict['stop'] = True
-            monitor_proc.join()
-
-            resource_util_dict['resource'] = mdict['resource']
-
-            # record the elapsed time
-            resource_util_dict['time'] = end - start
-
-        return ret, resource_util_dict
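A minimal usage sketch of `monitor_func` (the monitored function is hypothetical); it relies on the 'fork' start method, so it assumes a Unix-like platform:

```python
# Monitor a toy CPU-bound function, then analyze the sampled records.
def heavy_op(n):
    return sum(i * i for i in range(n))

ret, util = Monitor.monitor_func(heavy_op,
                                 args={'n': 10_000_000},
                                 sample_interval=0.5)
print(util['time'])                      # elapsed seconds
util = Monitor.analyze_single_resource_util(util)
print(util['resource_analysis'])         # max/min/avg per dynamic field
```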
diff --git a/_modules/data_juicer/core/tracer.html b/_modules/data_juicer/core/tracer.html
deleted file mode 100644
index adab73f74..000000000
--- a/_modules/data_juicer/core/tracer.html
+++ /dev/null
@@ -1,333 +0,0 @@

Source code for data_juicer.core.tracer
-import os
-
-import pandas as pd
-from datasets import Dataset
-from loguru import logger
-
-
-
[docs]class Tracer: - """ - The tracer to trace the sample changes before and after an operator - process. - - The comparison results will be stored in the work directory. - """ - -
[docs] def __init__(self, work_dir, show_num=10): - """ - Initialization method. - - :param work_dir: the work directory to store the comparison - results - :param show_num: the maximum number of samples to show in the - comparison result files. - """ - self.work_dir = os.path.join(work_dir, 'trace') - if not os.path.exists(self.work_dir): - os.makedirs(self.work_dir) - self.show_num = show_num
- -
[docs] def trace_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a Mapper. - - This will mainly show the different sample pairs due to the - modification by the Mapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert len(previous_ds) == len(processed_ds) - dif_dict = [] - num = 0 - - # Find different samples orderly between previous and processed - # datasets until the total number of found sample pairs is enough. - for i in range(len(previous_ds)): - previous_sample = previous_ds[i][text_key] - processed_sample = processed_ds[i][text_key] - if previous_sample != processed_sample: - dif_dict.append({ - 'original text': previous_sample, - 'processed_text': processed_sample, - }) - num += 1 - if num >= self.show_num: - break - - if len(dif_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are all ' - f'the same. Thus no comparison results would be ' - f'generated.') - return - elif len(dif_dict) < self.show_num: - logger.warning(f'There are {len(dif_dict)} different samples ' - f'before and after op [{op_name}] -- less than ' - f'expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(dif_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs] def trace_batch_mapper(self, op_name: str, previous_ds: Dataset, - processed_ds: Dataset, text_key: str): - """ - Compare datasets before and after a BatchMapper. - - This will mainly show the new samples augmented by the BatchMapper - - :param op_name: the op name of mapper - :param previous_ds: dataset before the mapper process - :param processed_ds: dataset processed by the mapper - :param text_key: which text_key to trace - :return: - """ - assert previous_ds[0][text_key] == processed_ds[0][text_key] - aug_dict = [] - - # Get the first samples - for i in range(len(processed_ds)): - processed_sample = processed_ds[i] - aug_dict.append(processed_sample) - if i + 1 >= self.show_num: - break - - if len(aug_dict) == 0: - logger.warning(f'Datasets before and after op [{op_name}] are ' - f'empty. Thus no comparison results would be ' - f'generated.') - return - elif len(aug_dict) < self.show_num: - logger.warning(f'There are only {len(aug_dict)} samples -- less ' - f'than expected {self.show_num} samples.') - - # export the tracer results. - res_name = f'mapper-{op_name}.jsonl' - dif_df = pd.DataFrame(aug_dict) - dif_df.to_json(os.path.join(self.work_dir, res_name), - orient='records', - lines=True, - force_ascii=False)
- -
[docs]    def trace_filter(self, op_name: str, previous_ds: Dataset,
-                     processed_ds: Dataset):
-        """
-        Compare datasets before and after a Filter.
-
-        This will mainly show the samples removed by the Filter.
-
-        :param op_name: the op name of filter
-        :param previous_ds: dataset before the filter process
-        :param processed_ds: dataset processed by the filter
-        :return:
-        """
-        if len(previous_ds) == len(processed_ds):
-            logger.warning(f'Datasets before and after op [{op_name}] are all '
-                           f'the same. Thus no comparison results would be '
-                           f'generated.')
-            return
-
-        # get the number of filtered samples.
-        total_dif_num = len(previous_ds) - len(processed_ds)
-        # index of the current sample in the previous dataset
-        i = 0
-        filter_dict = []
-        # number of found filtered samples. It's the offset between the two
-        # datasets as well.
-        num = 0
-        while i < len(previous_ds):
-            if i - num >= len(processed_ds) or \
-                    previous_ds[i] != processed_ds[i - num]:
-                # 1. If all samples in the processed dataset are checked but
-                # there are still some samples left in the previous dataset,
-                # all of these left samples are filtered.
-                # 2. If the corresponding samples in previous and processed
-                # datasets are different, samples in the previous dataset are
-                # filtered.
-                num += 1
-                filter_dict.append(previous_ds[i])
-            if num >= self.show_num or num >= total_dif_num:
-                # If the total number of found filtered samples is enough or we
-                # have found all filtered samples, just stop.
-                break
-            i += 1
-        if len(filter_dict) == 0:
-            logger.warning(f'Datasets before and after op [{op_name}] are all '
-                           f'the same. Thus no comparison results would be '
-                           f'generated.')
-            return
-        elif len(filter_dict) < self.show_num:
-            logger.warning(f'There are {len(filter_dict)} filtered samples '
-                           f'before and after op [{op_name}] -- less than '
-                           f'expected {self.show_num} samples.')
-
-        # export the tracer results.
-        res_name = f'filter-{op_name}.jsonl'
-        filter_df = pd.DataFrame(filter_dict)
-        filter_df.to_json(os.path.join(self.work_dir, res_name),
-                          orient='records',
-                          lines=True,
-                          force_ascii=False)
- -
[docs]    def trace_deduplicator(self, op_name: str, dup_pairs: list):
-        """
-        Compare datasets before and after a Deduplicator.
-
-        This will mainly show the near-duplicate sample pairs extracted
-        by the Deduplicator. Different from the other trace methods,
-        the trace process for a deduplicator is embedded into the process
-        method of the deduplicator itself, while the trace methods for
-        mapper and filter operators are independent of their process
-        methods.
-
-        :param op_name: the op name of deduplicator
-        :param dup_pairs: duplicate sample pairs obtained from
-            deduplicator
-        :return:
-        """
-        if dup_pairs is None:
-            logger.warning(f'Op [{op_name}] does not generate dup_pairs '
-                           f'correctly, thus no comparison results can be '
-                           f'obtained from this op.')
-            return
-        if len(dup_pairs) == 0:
-            logger.warning(f'Datasets before and after op [{op_name}] are all '
-                           f'the same. Thus no comparison results would be '
-                           f'generated.')
-            return
-        elif len(dup_pairs) < self.show_num:
-            logger.warning(f'There are only {len(dup_pairs)} duplicate sample '
-                           f'pairs from op [{op_name}] -- less than '
-                           f'expected {self.show_num} pairs.')
-
-        # reorganize the duplicate pairs
-        dup_dict = []
-        for key in dup_pairs:
-            dup_dict.append({
-                'dup1': dup_pairs[key][0],
-                'dup2': dup_pairs[key][1],
-            })
-
-        # export the tracer result.
-        res_name = f'duplicate-{op_name}.jsonl'
-        dup_df = pd.DataFrame(dup_dict)
-        dup_df.to_json(os.path.join(self.work_dir, res_name),
-                       orient='records',
-                       lines=True,
-                       force_ascii=False)
-
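A small usage sketch (work dir, op name, and samples are hypothetical): each trace method writes a JSONL file under `<work_dir>/trace/`.

```python
tracer = Tracer('./outputs', show_num=3)
dup_pairs = {
    'cluster-0': [{'text': 'hello world'}, {'text': 'hello  world'}],
}
# writes ./outputs/trace/duplicate-document_minhash_deduplicator.jsonl
tracer.trace_deduplicator('document_minhash_deduplicator', dup_pairs)
```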
diff --git a/_modules/data_juicer/ops/base_op.html b/_modules/data_juicer/ops/base_op.html
deleted file mode 100644
index 698c73c71..000000000
--- a/_modules/data_juicer/ops/base_op.html
+++ /dev/null
@@ -1,598 +0,0 @@

Source code for data_juicer.ops.base_op
-import copy
-import traceback
-from functools import wraps
-
-import numpy as np
-import pyarrow as pa
-from loguru import logger
-
-from data_juicer import is_cuda_available
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.mm_utils import size_to_bytes
-from data_juicer.utils.process_utils import calculate_np
-from data_juicer.utils.registry import Registry
-
-OPERATORS = Registry('Operators')
-UNFORKABLE = Registry('Unforkable')
-
-
-def convert_list_dict_to_dict_list(samples):
-    # reconstruct samples from "list of dicts" to "dict of lists"
-    keys = samples[0].keys()
-    res_samples = {}
-    for key in keys:
-        res_samples[key] = [s[key] for s in samples]
-    return res_samples
-
-
-def convert_dict_list_to_list_dict(samples):
-    # reconstruct samples from "dict of lists" to "list of dicts"
-    reconstructed_samples = []
-    keys = list(samples.keys())
-    # take any key, since they should be of same length
-    for i in range(len(samples[keys[0]])):
-        reconstructed_samples.append({key: samples[key][i] for key in samples})
-    return reconstructed_samples
-
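These two helpers are exact inverses for rectangular batches, e.g.:

```python
batch = [{'text': 'a', 'score': 1}, {'text': 'b', 'score': 2}]
columns = convert_list_dict_to_dict_list(batch)
# {'text': ['a', 'b'], 'score': [1, 2]}
assert convert_dict_list_to_list_dict(columns) == batch
```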
-
-def convert_arrow_to_python(method):
-
-    @wraps(method)
-    def wrapper(sample, *args, **kwargs):
-        if isinstance(sample, pa.Table):
-            sample = sample.to_pydict()
-        return method(sample, *args, **kwargs)
-
-    return wrapper
-
-
-def catch_map_batches_exception(method):
-    """
-    For batched-map sample-level fault tolerance.
-    """
-
-    @wraps(method)
-    @convert_arrow_to_python
-    def wrapper(samples, *args, **kwargs):
-        try:
-            return method(samples, *args, **kwargs)
-        except Exception as e:
-            from loguru import logger
-            logger.error(
-                f'An error occurred in mapper operation when processing '
-                f'samples {samples}, {type(e)}: {e}')
-            traceback.print_exc()
-            ret = {key: [] for key in samples.keys()}
-            ret[Fields.stats] = []
-            ret[Fields.source_file] = []
-            return ret
-
-    return wrapper
-
-
-def catch_map_single_exception(method):
-    """
-    For single-map sample-level fault tolerance.
-    The input sample is expected to have batch_size = 1.
-    """
-
-    def is_batched(sample):
-        val_iter = iter(sample.values())
-        first_val = next(val_iter)
-        if not isinstance(first_val, list):
-            return False
-        first_len = len(first_val)
-        return all(
-            isinstance(val, list) and len(val) == first_len
-            for val in val_iter)
-
-    @wraps(method)
-    @convert_arrow_to_python
-    def wrapper(sample, *args, **kwargs):
-        if is_batched(sample):
-            try:
-                sample = convert_dict_list_to_list_dict(sample)[0]
-                res_sample = method(sample, *args, **kwargs)
-                return convert_list_dict_to_dict_list([res_sample])
-            except Exception as e:
-                from loguru import logger
-                logger.error(
-                    f'An error occurred in mapper operation when processing '
-                    f'sample {sample}, {type(e)}: {e}')
-                traceback.print_exc()
-                ret = {key: [] for key in sample.keys()}
-                ret[Fields.stats] = []
-                ret[Fields.source_file] = []
-                return ret
-        else:
-            # without fault tolerance
-            return method(sample, *args, **kwargs)
-
-    return wrapper
-
-
-class OP:
-
-    _accelerator = 'cpu'
-    _batched_op = False
-
-    def __init__(self, *args, **kwargs):
-        """
-        Base class of operators.
-
-        :param text_key: the key name of field that stores sample texts
-            to be processed.
-        :param image_key: the key name of field that stores sample image list
-            to be processed
-        :param audio_key: the key name of field that stores sample audio list
-            to be processed
-        :param video_key: the key name of field that stores sample video list
-            to be processed
-        """
-        # init data keys
-        self.text_key = kwargs.get('text_key', 'text')
-        self.image_key = kwargs.get('image_key', 'images')
-        self.audio_key = kwargs.get('audio_key', 'audios')
-        self.video_key = kwargs.get('video_key', 'videos')
-
-        self.query_key = kwargs.get('query_key', 'query')
-        self.response_key = kwargs.get('response_key', 'response')
-        self.history_key = kwargs.get('history_key', 'history')
-
-        self.batch_size = kwargs.get('batch_size', 1000)
-
-        # whether the model can be accelerated using cuda
-        _accelerator = kwargs.get('accelerator', None)
-        if _accelerator is not None:
-            self.accelerator = _accelerator
-        else:
-            self.accelerator = self._accelerator
-
-        # parameters to determine the number of procs for this op
-        self.num_proc = kwargs.get('num_proc', None)
-        self.cpu_required = kwargs.get('cpu_required', 1)
-        self.mem_required = kwargs.get('mem_required', 0)
-        if isinstance(self.mem_required, str):
-            self.mem_required = size_to_bytes(self.mem_required) / 1024**3
-
-        self.turbo = kwargs.get('turbo', False)
-
-        # nested wrappers
-        from data_juicer.core.data import wrap_func_with_nested_access
-        for name in ['process', 'compute_stats', 'compute_hash']:
-            method = getattr(self, name, None)
-            if method and callable(method):
-                setattr(self, f'_{name}', method)
-                method = wrap_func_with_nested_access(method)
-                setattr(self, name, method)
-
-    @classmethod
-    def is_batched_op(cls):
-        return cls._batched_op
-
-    def process(self, *args, **kwargs):
-        raise NotImplementedError
-
-    def use_cuda(self):
-        return self.accelerator == 'cuda' and is_cuda_available()
-
-    def runtime_np(self):
-        op_proc = calculate_np(self._name, self.mem_required,
-                               self.cpu_required, self.num_proc,
-                               self.use_cuda())
-        logger.debug(
-            f'Op [{self._name}] running with number of procs:{op_proc}')
-        return op_proc
-
-    def remove_extra_parameters(self, param_dict, keys=None):
-        """
-            At the beginning of the init of the mapper op, call
-            self.remove_extra_parameters(locals())
-            to get the init parameter dict of the op for convenience.
-
-        """
-        if keys is None:
-            param_dict = {
-                k: v
-                for k, v in param_dict.items() if not k.startswith('_')
-            }
-            param_dict.pop('self', None)
-        else:
-            param_dict = {k: v for k, v in param_dict.items() if k not in keys}
-        return param_dict
-
-    def add_parameters(self, init_parameter_dict, **extra_param_dict):
-        """
-            Add parameters for each sample; keep extra_param_dict
-            and init_parameter_dict unchanged.
-        """
-        related_parameters = copy.deepcopy(init_parameter_dict)
-        related_parameters.update(extra_param_dict)
-        return related_parameters
-
-    def run(self, dataset):
-        from data_juicer.core.data import NestedDataset
-        if not isinstance(dataset, NestedDataset):
-            dataset = NestedDataset(dataset)
-        return dataset
-
-    def empty_history(self):
-        return np.empty((0, 0), dtype=str)
-
-
-
[docs]class Mapper(OP): - -
[docs] def __init__(self, *args, **kwargs): - """ - Base class that conducts data editing. - - :param text_key: the key name of field that stores sample texts - to be processed. - :param image_key: the key name of field that stores sample image list - to be processed - :param audio_key: the key name of field that stores sample audio list - to be processed - :param video_key: the key name of field that stores sample video list - to be processed - """ - super(Mapper, self).__init__(*args, **kwargs) - - # runtime wrappers - if self.is_batched_op(): - self.process = catch_map_batches_exception(self.process_batched) - else: - self.process = catch_map_single_exception(self.process_single)
- - # set the process method is not allowed to be overridden - def __init_subclass__(cls, **kwargs): - not_allowed_list = ['process'] - for method_name in not_allowed_list: - if method_name in cls.__dict__: - raise TypeError( - f'Method {method_name} cannot be overridden by subclass ' - f'{cls.__name__}. Please implement {method_name}_single ' - f'or {method_name}_batched.') - -
[docs] def process_batched(self, samples, *args, **kwargs): - keys = samples.keys() - first_key = next(iter(keys)) - num_samples = len(samples[first_key]) - for i in range(num_samples): - this_sample = {key: samples[key][i] for key in keys} - res_sample = self.process_single(this_sample, *args, **kwargs) - for key in keys: - samples[key][i] = res_sample[key] - - return samples
- -
[docs] def process_single(self, sample): - """ - For sample level, sample --> sample - - :param sample: sample to process - :return: processed sample - """ - raise NotImplementedError
- -
[docs] def run(self, dataset, *, exporter=None, tracer=None): - dataset = super(Mapper, self).run(dataset) - new_dataset = dataset.map( - self.process, - num_proc=self.runtime_np(), - with_rank=self.use_cuda(), - batch_size=self.batch_size, - desc=self._name + '_process', - ) - if tracer: - tracer.trace_mapper(self._name, dataset, new_dataset, - self.text_key) - return new_dataset
- - -
[docs]class Filter(OP): - -
[docs] def __init__(self, *args, **kwargs): - """ - Base class that removes specific info. - - :param text_key: the key name of field that stores sample texts - to be processed - :param image_key: the key name of field that stores sample image list - to be processed - :param audio_key: the key name of field that stores sample audio list - to be processed - :param video_key: the key name of field that stores sample video list - to be processed - """ - super(Filter, self).__init__(*args, **kwargs) - self.stats_export_path = kwargs.get('stats_export_path', None) - - # runtime wrappers - if self.is_batched_op(): - self.compute_stats = catch_map_batches_exception( - self.compute_stats_batched) - self.process = catch_map_batches_exception(self.process_batched) - else: - self.compute_stats = catch_map_single_exception( - self.compute_stats_single) - self.process = catch_map_single_exception(self.process_single)
- - # set the process method is not allowed to be overridden - def __init_subclass__(cls, **kwargs): - not_allowed_list = ['compute_stats', 'process'] - for method_name in not_allowed_list: - if method_name in cls.__dict__: - raise TypeError( - f'Method {method_name} cannot be overridden by subclass ' - f'{cls.__name__}. Please implement {method_name}_single ' - f'or {method_name}_batched.') - -
[docs] def compute_stats_batched(self, samples, *args, **kwargs): - keys = samples.keys() - num_samples = len(samples[Fields.stats]) - for i in range(num_samples): - this_sample = {key: samples[key][i] for key in keys} - res_sample = self.compute_stats_single(this_sample, *args, - **kwargs) - samples[Fields.stats][i] = res_sample[Fields.stats] - if 'context' in kwargs and kwargs['context']: - samples[Fields.context][i] = res_sample[Fields.context] - - return samples
- -
[docs] def process_batched(self, samples): - return map(lambda stat: self.process_single({Fields.stats: stat}), - samples[Fields.stats])
- -
[docs] def compute_stats_single(self, sample, context=False): - """ - Compute stats for the sample which is used as a metric to decide - whether to filter this sample. - - :param sample: input sample. - :param context: whether to store context information of intermediate - vars in the sample temporarily. - :return: sample with computed stats - """ - raise NotImplementedError
- -
[docs] def process_single(self, sample): - """ - For sample level, sample --> Boolean. - - :param sample: sample to decide whether to filter - :return: true for keeping and false for filtering - """ - raise NotImplementedError
- -
[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True): - dataset = super(Filter, self).run(dataset) - if Fields.stats not in dataset.features: - from data_juicer.core.data import add_same_content_to_new_column - dataset = dataset.map(add_same_content_to_new_column, - fn_kwargs={ - 'new_column_name': Fields.stats, - 'initial_value': {} - }, - num_proc=self.runtime_np(), - batch_size=self.batch_size, - desc='Adding new column for stats') - dataset = dataset.map(self.compute_stats, - num_proc=self.runtime_np(), - with_rank=self.use_cuda(), - batch_size=self.batch_size, - desc=self._name + '_compute_stats') - if exporter and self.stats_export_path is not None: - exporter.export_compute_stats(dataset, self.stats_export_path) - if reduce: - new_dataset = dataset.filter(self.process, - num_proc=self.runtime_np(), - batch_size=self.batch_size, - desc=self._name + '_process') - if tracer: - tracer.trace_filter(self._name, dataset, new_dataset) - return new_dataset - else: - return dataset
- - -
[docs]class Deduplicator(OP): - -
[docs] def __init__(self, *args, **kwargs): - """ - Base class that conducts deduplication. - - :param text_key: the key name of field that stores sample texts - to be processed - :param image_key: the key name of field that stores sample image list - to be processed - :param audio_key: the key name of field that stores sample audio list - to be processed - :param video_key: the key name of field that stores sample video list - to be processed - """ - super(Deduplicator, self).__init__(*args, **kwargs) - - # runtime wrappers - if self.is_batched_op(): - self.compute_hash = catch_map_batches_exception(self.compute_hash) - else: - self.compute_hash = catch_map_single_exception(self.compute_hash)
- -
[docs] def compute_hash(self, sample): - """ - Compute hash values for the sample. - - :param sample: input sample - :return: sample with computed hash value. - """ - raise NotImplementedError
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - raise NotImplementedError
- -
[docs] def run(self, dataset, *, exporter=None, tracer=None, reduce=True): - dataset = super(Deduplicator, self).run(dataset) - dataset = dataset.map(self.compute_hash, - num_proc=self.runtime_np(), - with_rank=self.use_cuda(), - desc=self._name + '_compute_hash') - if reduce: - show_num = tracer.show_num if tracer else 0 - new_dataset, dup_pairs = self.process(dataset, show_num) - if tracer: - tracer.trace_deduplicator(self._name, dup_pairs) - return new_dataset - else: - return dataset
- - -
[docs]class Selector(OP): - -
[docs] def __init__(self, *args, **kwargs): - """ - Base class that conducts selection in dataset-level. - - :param text_key: the key name of field that stores sample texts - to be processed - :param image_key: the key name of field that stores sample image list - to be processed - :param audio_key: the key name of field that stores sample audio list - to be processed - :param video_key: the key name of field that stores sample video list - to be processed - """ - super(Selector, self).__init__(*args, **kwargs)
- -
[docs] def process(self, dataset): - """ - Dataset --> dataset. - - :param dataset: input dataset - :return: selected dataset. - """ - raise NotImplementedError
- -
[docs] def run(self, dataset, *, exporter=None, tracer=None): - dataset = super(Selector, self).run(dataset) - new_dataset = self.process(dataset) - if tracer: - tracer.trace_filter(self._name, dataset, new_dataset) - return new_dataset
-
diff --git a/_modules/data_juicer/ops/common/helper_func.html b/_modules/data_juicer/ops/common/helper_func.html
deleted file mode 100644
index 4ef4eca58..000000000
--- a/_modules/data_juicer/ops/common/helper_func.html
+++ /dev/null
@@ -1,309 +0,0 @@

Source code for data_juicer.ops.common.helper_func
-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-from typing import Dict
-
-import regex as re
-
-
-class UnionFind:
-
-    def __init__(self):
-        """Initialization method."""
-        self.parent: Dict[int, int] = {}
-
-    def find(self, x):
-        if x not in self.parent:
-            self.parent[x] = x
-        if self.parent[x] != x:
-            self.parent[x] = self.find(self.parent[x])
-        return self.parent[x]
-
-    def union(self, x, y):
-        px = self.find(x)
-        py = self.find(y)
-        self.parent[px] = self.parent[py] = min(px, py)
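A small sketch of the structure above: `find` performs path compression, and `union` always roots a cluster at its minimum id, which the deduplicators later rely on when keeping only samples whose parent is themselves.

```python
uf = UnionFind()
uf.union(3, 1)
uf.union(4, 3)
assert uf.find(4) == uf.find(1) == 1  # 1 is the smallest member id
```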
-
-
-
[docs]def strip(document, strip_characters): - """ - Way faster than document.strip(strip_characters) since strip_characters is - now a set instead of a str, and it contains a lot of elements (all the - emojis). - - :param document: document to be processed - :param strip_characters: characters used for stripping document - :return: stripped document - """ - if not document: - return document - beg_ind = 0 - end_ind = len(document) - for i in range(len(document)): - if document[i] in strip_characters: - beg_ind += 1 - else: - break - for i in range(1, len(document) + 1): - if document[-i] in strip_characters: - end_ind -= 1 - else: - break - document_stripped = document[beg_ind:end_ind] - return document_stripped
- - -
[docs]def split_on_whitespace(document, new_line=False, tab=False):
-    """
-    Split a document on whitespace characters. This method also removes
-    concatenated spaces.
-
-    :param document: document to be split
-    :param new_line: whether to split document with '\\\\n'
-    :param tab: whether to split document with '\\\\t'
-    :return: word list obtained after splitting document
-    """
-    sep = [' '] + new_line * ['\n'] + tab * ['\t']
-    sep = '|'.join(sep)
-    split_document = re.split(sep, document)
-    split_document = [word for word in split_document if word]
-    return split_document
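For example (a toy document, not from the library's tests):

```python
split_on_whitespace('a  b\tc\nd', new_line=True, tab=True)
# -> ['a', 'b', 'c', 'd']; empty strings from '  ' are dropped
```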
- - -
[docs]def split_on_newline_tab_whitespace(document):
-    """
-    This method is used to split the document into different levels of
-    sub-sentences.
-
-    First split on "\\\\n", then on "\\\\t", then on " ".
-    :param document: document to be split
-    :return: sentence list obtained after splitting document
-    """
-    sentences = document.split('\n')
-    sentences = [sentence.split('\t') for sentence in sentences]
-    sentences = [[
-        split_on_whitespace(subsentence) for subsentence in sentence
-    ] for sentence in sentences]
-    return sentences
- - -
[docs]def merge_on_whitespace_tab_newline(sentences): - """ - This method is used to merge different levels of sub-sentences into one - document. Invert the method split_on_newline_tab_whitespace. Removes - concatenated separators. - - :param sentences: sentence list to be merged - :return: document obtained after merging sub-sentences - """ - sentences = [[ - ' '.join(subsentence) for subsentence in sentence if subsentence - ] for sentence in sentences] - sentences = ['\t'.join(sentence) for sentence in sentences if sentence] - if not sentences: - return '' - document = '\n'.join(sentences) - return document
- - -
[docs]def words_augmentation(words, group_size, join_char):
-    """
-    Augment words, especially for Chinese (without a space between words) and
-    Vietnamese (with a space between syllables).
-
-    :param words: word list to be augmented
-    :param group_size: the size of word groups that need to be merged
-    :param join_char: characters to be added between word groups
-    :return: word list after augmentation
-    """
-    augmentation = [
-        join_char.join(words[i:i + group_size])
-        for i in range(len(words) - group_size + 1)
-    ]
-    return augmentation
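For example, 2-grams joined with an empty string, as one might do for Chinese text (toy input):

```python
words_augmentation(['a', 'b', 'c'], group_size=2, join_char='')
# -> ['ab', 'bc']
```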
- - -
[docs]def get_words_from_document( - document, - token_func=None, - new_line=True, - tab=True, -): - """ - Get words from a document. Useful to compute ratios, like the - stopwords ratio. - - :param document: document that need to split words. - :param token_func: function of tokenizer, if specified, the function - will be used for split document into different tokens. - :param new_line: whether to use '\\\\n' to split words. - :param tab: whether to use '\\\\t' to split words. - :return: word list obtained from document - """ - if token_func: - words = token_func(document) - else: - words = split_on_whitespace(document, new_line, tab) - return words
- - -
[docs]def words_refinement(words,
-                     lower_case=False,
-                     strip_chars=None,
-                     use_words_aug=False,
-                     words_aug_group_sizes=[2],
-                     words_aug_join_char=''):
-    """
-    Refine split words. Non-reversible, since the document was split on
-    multiple characters, words are stripped of special characters, and
-    characters are converted to lower case.
-
-    :param words: the word list to be refined
-    :param lower_case: whether to convert words to lowercase
-    :param strip_chars: chars that need to be stripped in words
-    :param use_words_aug: whether to use word augmentation
-    :param words_aug_group_sizes: the size of word groups that need to
-        be merged
-    :param words_aug_join_char: characters to be added between word
-        groups
-    :return: refined words or word list
-    """
-
-    if lower_case:
-        words = [word.lower() for word in words]
-    if strip_chars:
-        words = [strip(word, strip_chars) for word in words]
-        words = [word for word in words if word]
-    if use_words_aug:
-        augmentation = [
-            words_augmentation(words, group_size, words_aug_join_char)
-            for group_size in words_aug_group_sizes
-        ]
-        augmentation = [word for augm in augmentation for word in augm]
-        words = words + augmentation
-    return words
- - -
[docs]def get_sentences_from_document(document, model_func=None): - """ - Get sentences from a document. - - :param document: document that need to split sentences - :param model_func: function of sentence model, if specified, the - function will be used for spliting document into different - sentences. - :return: document with the sentences separated by '\\\\n' - """ - if model_func: - sentences = model_func(document) - else: - sentences = document.splitlines() - return '\n'.join(sentences)
-
diff --git a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
deleted file mode 100644
index 48c9a5165..000000000
--- a/_modules/data_juicer/ops/deduplicator/document_deduplicator.html
+++ /dev/null
@@ -1,221 +0,0 @@

Source code for data_juicer.ops.deduplicator.document_deduplicator
-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation/blob/main/preprocessing/training/01a_catalogue_cleaning_and_filtering/clean_helpers/deduplication.py
-# --------------------------------------------------------
-
-import hashlib
-import string
-from collections import defaultdict
-from typing import Dict, Set
-
-import regex as re
-
-from data_juicer.utils.constant import HashKeys
-
-from ..base_op import OPERATORS, Deduplicator
-
-
-
[docs]@OPERATORS.register_module('document_deduplicator') -class DocumentDeduplicator(Deduplicator): - """ - Deduplicator to deduplicate samples at document-level using exact matching. - - Using md5 hash to deduplicate samples. - """ - -
[docs] def __init__(self, - lowercase: bool = False, - ignore_non_character: bool = False, - *args, - **kwargs): - """ - Initialization method. - - :param lowercase: Whether to convert sample text to lower case - :param ignore_non_character: Whether to ignore non-alphabet - characters, including whitespaces, digits, and punctuations - :param args: extra args - :param kwargs: extra args. - """ - super().__init__(*args, **kwargs) - self.lowercase = lowercase - self.remove_non_character_regex = re.compile( - f'\s+|\d+|[{re.escape(string.punctuation)}]' # noqa: W605 - ) if ignore_non_character else None
- -
[docs] def compute_hash(self, sample): - """ - Compute md5 hash values for the sample. - - :param sample: input sample - :return: sample with md5 hash value. - """ - # check if it's computed already - if HashKeys.hash in sample: - return sample - - text = sample[self.text_key] - if self.lowercase: - text = text.lower() - if self.remove_non_character_regex: - text = self.remove_non_character_regex.sub('', text) - - def _get_hash(txt): - return hashlib.md5(txt.strip().encode('utf-8')).hexdigest() - - sample[HashKeys.hash] = _get_hash(text) - return sample
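A standalone sketch of the normalization plus hashing above, with both options enabled (using the stdlib `re` instead of the `regex` package):

```python
import hashlib
import re
import string

text = 'Hello, World! 42'
text = text.lower()  # lowercase=True
# ignore_non_character=True: drop whitespace, digits, and punctuation
text = re.sub(fr'\s+|\d+|[{re.escape(string.punctuation)}]', '', text)
print(hashlib.md5(text.strip().encode('utf-8')).hexdigest())
# any sample normalizing to 'helloworld' gets the same hash
```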
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - dup_hashes = None - if show_num > 0: - # sample duplicate pairs - hash2ids: Dict[int, Set[int]] = defaultdict(set) - for sid, hash_val in enumerate(dataset[HashKeys.hash]): - hash2ids[hash_val].add(sid) - dup_samples = sorted(list(hash2ids.items()), - key=lambda x: len(x[1]), - reverse=True) - dup_hashes = set([ - item[0] for item in dup_samples if len(item[1]) > 1 - ][:show_num]) - - def _filter_dup_helper(sample, hashes): - hash = sample[HashKeys.hash] - if show_num > 0 and hash in dup_hashes \ - and len(dup_pairs[hash]) < 2: - # tracer is open and not enough duplicate sample pairs - dup_pairs[hash].append(sample) - if hash in hashes: - return False - else: - hashes.add(hash) - return True - - hashes = set() - dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {} - dataset = dataset.filter( - _filter_dup_helper, - fn_kwargs=dict(hashes=hashes), - load_from_cache_file=False if show_num > 0 else True) # num_proc=1 - return dataset, dup_pairs
-
diff --git a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
deleted file mode 100644
index 8dab1040a..000000000
--- a/_modules/data_juicer/ops/deduplicator/document_minhash_deduplicator.html
+++ /dev/null
@@ -1,449 +0,0 @@

Source code for data_juicer.ops.deduplicator.document_minhash_deduplicator
-# Some code here has been modified from:
-# https://github.com/bigcode-project/bigcode-dataset/blob/main/near_deduplication/minhash_deduplication.py
-# --------------------------------------------------------
-
-import hashlib
-import struct
-from collections import defaultdict
-from typing import Optional
-
-import numpy as np
-import regex
-from loguru import logger
-from pydantic import Field, PositiveInt
-from tqdm import tqdm
-from typing_extensions import Annotated
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import prepare_sentencepiece_model
-
-from ..base_op import OPERATORS, Deduplicator
-from ..common.helper_func import UnionFind, split_on_whitespace
-
-integrate = LazyLoader('integrate', 'scipy.integrate')
-
-OP_NAME = 'document_minhash_deduplicator'
-
-MERSENNE_PRIME = np.uint64((1 << 61) - 1)
-MAX_HASH = np.uint64((1 << 32) - 1)
-
-
-def sha1_hash32(data):
-    """
-    Directly taken from datasketch package to avoid dependency.
-
-    Parameters
-    ----------
-    data : bytes
-
-    Returns
-    -------
-    int
-    """
-    return struct.unpack('<I', hashlib.sha1(data).digest()[:4])[0]
-
-
-def optimal_param(
-    threshold: float,
-    num_perm: int,
-    false_positive_weight: float = 0.5,
-    false_negative_weight: float = 0.5,
-):
-    """
-    Compute the optimal `MinHashLSH` parameter that minimizes the weighted sum
-    of probabilities of false positive and false negative, taken from
-    datasketch.
-
-    :param threshold: float. The threshold for similarity
-    :param num_perm: int. The number of permutations
-    :param false_positive_weight: float. The weight of false positive
-    :param false_negative_weight: float. The weight of false negative
-    :return: Tuple[int, int]. The optimal `b` and `r` parameters. The number of
-        bands, and the number of rows per band respectively
-    """
-
-    def false_positive_probability(th: float, band: int, rows: int):
-        """Source: `datasketch.lsh`"""
-
-        def proba(s):
-            return 1 - (1 - s**float(rows))**float(band)
-
-        a, _ = integrate.quad(proba, 0.0, th)
-        return a
-
-    def false_negative_probability(th: float, band: int, rows: int):
-        """Source: `datasketch.lsh`"""
-
-        def proba(s):
-            return 1 - (1 - (1 - s**float(rows))**float(band))
-
-        a, _ = integrate.quad(proba, th, 1.0)
-        return a
-
-    # object: minimize the weighted FP and FN ratio
-    min_error = float('inf')
-    opt = (0, 0)
-    for b in range(1, num_perm + 1):
-        max_r = int(num_perm / b)
-        for r in range(1, max_r + 1):
-            fp = false_positive_probability(threshold, b, r)
-            fn = false_negative_probability(threshold, b, r)
-            error = fp * false_positive_weight + fn * false_negative_weight
-            if error < min_error:
-                min_error = error
-                opt = (b, r)
-    return opt
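A quick sanity sketch of `optimal_param` (the exact output depends on the numerical integration, so no specific values are asserted here):

```python
b, r = optimal_param(threshold=0.7, num_perm=256)
assert 1 <= b and 1 <= r and b * r <= 256  # bands * rows <= num_perm
```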
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class DocumentMinhashDeduplicator(Deduplicator): - """ - Deduplicator to deduplicate samples at document-level using MinHashLSH. - - Different from simhash, minhash is stored as bytes, so they won't be - kept in the final dataset. - """ - -
[docs]    def __init__(
-        self,
-        tokenization: str = 'space',
-        window_size: PositiveInt = 5,
-        lowercase: bool = True,
-        ignore_pattern: Optional[str] = None,
-        num_permutations: PositiveInt = 256,
-        jaccard_threshold: Annotated[float, Field(ge=0, le=1)] = 0.7,
-        num_bands: Optional[PositiveInt] = None,
-        num_rows_per_band: Optional[PositiveInt] = None,
-        tokenizer_model: Optional[str] = None,
-        *args,
-        **kwargs,
-    ):
-        """
-        Initialization method.
-
-        :param tokenization: tokenization method for sample texts. It
-            should be one of [space, punctuation, character,
-            sentencepiece]. For English-like languages, we recommend
-            to use 'space', for Chinese-like languages, we recommend
-            to use 'character', and for multiple languages, we recommend
-            to use 'sentencepiece'. If using 'sentencepiece', please
-            provide the model path in the 'tokenizer_model' field.
-        :param window_size: window size of shingling
-        :param lowercase: whether to convert text to lower case first
-        :param ignore_pattern: whether to ignore sub-strings with
-            specific pattern when computing minhash
-        :param num_permutations: number of permutations in minhash
-            computing
-        :param jaccard_threshold: the min jaccard similarity threshold
-            in near-duplicate detection. When the jaccard similarity of
-            two sample texts is >= this threshold, they are regarded as
-            similar samples and this op will only keep one of them after
-            deduplication
-        :param num_bands: number of bands in LSH. By default it's None,
-            and it will be determined by an optimal parameter computation
-            algorithm that minimizes the weighted sum of the probabilities
-            of false positives and false negatives
-        :param num_rows_per_band: number of rows in each band in LSH.
-            By default it's None, and it will be determined by the same
-            optimal parameter computation algorithm
-        :param tokenizer_model: path for the sentencepiece model, used for
-            sentencepiece tokenization.
- """ - super().__init__(*args, **kwargs) - # about minhash computation - self.tokenization = tokenization - self.window_size = window_size - self.lowercase = lowercase - self.ignore_pattern = ignore_pattern - if self.ignore_pattern: - self.ignore_pattern = regex.compile(self.ignore_pattern) - - # check parameters - if self.ignore_pattern and self.tokenization == 'punctuation': - logger.warning('Be careful that tokenization with punctuations ' - 'won\'t work if the ignore pattern includes ' - 'punctuations.') - self.punctuation_pattern = regex.compile(r'\p{P}') - - if self.tokenization == 'sentencepiece': - if tokenizer_model is None: - raise ValueError("To use 'sentencepiece' tokenization, " - "'tokenizer_model' is required.") - self.tokenizer = prepare_sentencepiece_model(tokenizer_model) - else: - self.tokenizer = None - - # about deduplication - self.num_permutation = num_permutations - self.jaccard_threshold = jaccard_threshold - self.num_bands = num_bands - self.num_rows_per_band = num_rows_per_band - - # initialize deduplication parameters - # check number of bands and rows - if self.num_bands is None or self.num_rows_per_band is None: - self.num_bands, self.num_rows_per_band = optimal_param( - self.jaccard_threshold, - self.num_permutation, - ) - - # compute hash ranges and create hash tables - self.hash_ranges = [(i * self.num_rows_per_band, - (i + 1) * self.num_rows_per_band) - for i in range(self.num_bands)] - self.hash_tables = [defaultdict(set) for _ in range(self.num_bands)] - - # generate permutations - gen = np.random.RandomState(seed=42) - self.perm_a, self.perm_b = np.array( - [( - gen.randint(1, MERSENNE_PRIME, dtype=np.uint64), - gen.randint(0, MERSENNE_PRIME, dtype=np.uint64), - ) for _ in range(self.num_permutation)], - dtype=np.uint64, - ).T
- -
[docs] def compute_hash(self, sample): - """ - Compute minhash values for the sample. - - :param sample: input sample - :return: sample with minhash value. - """ - # check if it's computed already - if HashKeys.minhash in sample: - return sample - - text = sample[self.text_key] - - if self.lowercase: - text = text.lower() - if self.ignore_pattern: - text = self.ignore_pattern.sub('', text) - - # get tokens for different tokenization method - tokens = set() - if self.tokenization == 'character': - tokens = { - str.encode(text[i:i + self.window_size]) - for i in range(len(text) - self.window_size) - } - elif self.tokenization == 'punctuation': - tokens = self.punctuation_pattern.split(text) - tokens = { - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - } - elif self.tokenization == 'space': - tokens = split_on_whitespace(text) - tokens = { - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - } - elif self.tokenization == 'sentencepiece': - tokens = self.tokenizer.encode(text, out_type=str) - tokens = { - str.encode(''.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - } - else: - raise NotImplementedError( - f'Unimplemented tokenization method [{self.tokenization}]') - - # compute minhash value - hv = np.array([sha1_hash32(token) for token in tokens], - dtype=np.uint64) - phv = np.bitwise_and( - ((hv * np.tile(self.perm_a, - (len(hv), 1)).T).T + self.perm_b) % MERSENNE_PRIME, - MAX_HASH) - hash_values = np.vstack([ - phv, - np.ones(self.num_permutation, dtype=np.uint64) * MAX_HASH - ]).min(axis=0) - sample[HashKeys.minhash] = [ - bytes(hash_values[start:end].byteswap().data) - for start, end in self.hash_ranges - ] - return sample
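To illustrate the banding at the end of `compute_hash`: with hypothetical LSH parameters (b, r) = (32, 8) and 256 permutations, the signature is cut into 32 byte-serialized bands, each of which acts as one LSH bucket key.

```python
num_bands, num_rows_per_band = 32, 8  # hypothetical; 32 * 8 == 256
hash_ranges = [(i * num_rows_per_band, (i + 1) * num_rows_per_band)
               for i in range(num_bands)]
# hash_ranges[0] == (0, 8), hash_ranges[31] == (248, 256)
```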
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - minhashes = dataset[HashKeys.minhash] - # remove bytes minhash column otherwise unexpected error would occur - # when exporting the processed dataset - dataset = dataset.remove_columns([HashKeys.minhash]) - - # make clusters -- construct the minhash lookup tables of seg to ids - logger.info(f'Start clustering for {len(dataset)} samples...') - batch_size = 10000 - for i in tqdm(range(0, len(minhashes), batch_size), - dynamic_ncols=True, - desc='Iterating MinHashes of samples...'): - batch = minhashes[i:i + batch_size] - for idx, hs in enumerate(batch): - for h, hashtable in zip(hs, self.hash_tables): - hashtable[h].add(idx + i) - - # using UnionFind set to union samples within the same clusters - union_find = UnionFind() - for table in tqdm(self.hash_tables, - dynamic_ncols=True, - desc='Clustering'): - for cluster in table.values(): - if len(cluster) <= 1: - continue - idx = min(cluster) - for x in cluster: - union_find.union(x, idx) - logger.info(f'There are {len(set(union_find.parent.values()))} ' - f'clusters that includes multiple near-duplicate samples.') - - # record the duplicate sample pairs - dup_pairs = {} - if show_num > 0: - for i in range(len(dataset)): - cluster_idx = union_find.find(i) - if cluster_idx not in dup_pairs and cluster_idx != i: - dup_pairs[cluster_idx] = [ - dataset[cluster_idx], - dataset[i], - ] - if len(dup_pairs) >= show_num: - break - - # filtering -- only keep those samples whose parent index is itself, - # including: - # 1. samples that form a cluster by themselves - # 2. the first sample in a cluster that includes multiple samples - def _filter_minhash_dup_helper(sample, index): - return union_find.find(index) == index - - dataset = dataset.filter( - _filter_minhash_dup_helper, - with_indices=True, - ) - logger.info(f'Keep {len(dataset)} samples after MinHash dedup.') - - return dataset, dup_pairs
-
diff --git a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html b/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
deleted file mode 100644
index bc5ec9fc7..000000000
--- a/_modules/data_juicer/ops/deduplicator/document_simhash_deduplicator.html
+++ /dev/null
@@ -1,335 +0,0 @@

Source code for data_juicer.ops.deduplicator.document_simhash_deduplicator
-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from collections import defaultdict, deque
-from typing import Dict, Optional, Set
-
-import numpy as np
-import regex
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, Deduplicator
-from ..common.helper_func import split_on_whitespace
-
-simhash = LazyLoader('simhash', 'simhash')
-
-OP_NAME = 'document_simhash_deduplicator'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class DocumentSimhashDeduplicator(Deduplicator): - """Deduplicator to deduplicate samples at document-level using SimHash.""" - -
[docs]    def __init__(self,
-                 tokenization: str = 'space',
-                 window_size: PositiveInt = 6,
-                 lowercase: bool = True,
-                 ignore_pattern: Optional[str] = None,
-                 num_blocks: PositiveInt = 6,
-                 hamming_distance: PositiveInt = 4,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param tokenization: tokenization method for sample texts. It
-            should be one of [space, punctuation, character]. For
-            English-like languages, we recommend to use 'space', and for
-            Chinese-like languages, we recommend to use 'character'.
-        :param window_size: window size of shingling
-        :param lowercase: whether to convert text to lower case first
-        :param ignore_pattern: whether to ignore sub-strings with
-            specific pattern when computing simhash
-        :param num_blocks: number of blocks in simhash computing
-        :param hamming_distance: the max hamming distance threshold in
-            near-duplicate detection. When the hamming distance of two
-            sample texts is <= this threshold, they are regarded as
-            similar samples and this op will only keep one of them after
-            deduplication. This threshold should always be less than
-            num_blocks
-        """
-        # about simhash computation
-        super().__init__(*args, **kwargs)
-        self.tokenization = tokenization
-        self.window_size = window_size
-        self.lowercase = lowercase
-        self.ignore_pattern = ignore_pattern
-        if self.ignore_pattern:
-            self.ignore_pattern = regex.compile(self.ignore_pattern)
-
-        # check parameters
-        if self.ignore_pattern and self.tokenization == 'punctuation':
-            logger.warning('Be careful that tokenization with punctuations '
-                           'won\'t work if the ignore pattern includes '
-                           'punctuations.')
-        self.punctuation_pattern = regex.compile(r'\p{P}')
-
-        # about deduplication
-        self.num_blocks = num_blocks
-        self.hamming_distance = hamming_distance
- -
[docs] def compute_hash(self, sample): - """ - Compute simhash values for the sample. - - :param sample: input sample - :return: sample with simhash value. - """ - # check if it's computed already - if HashKeys.simhash in sample: - return sample - - text = sample[self.text_key] - - if self.lowercase: - text = text.lower() - if self.ignore_pattern: - text = self.ignore_pattern.sub('', text) - - # get tokens for different tokenization method - tokens = [] - if self.tokenization == 'character': - tokens = [ - str.encode(text[i:i + self.window_size]) - for i in range(len(text) - self.window_size) - ] - elif self.tokenization == 'punctuation': - tokens = self.punctuation_pattern.split(text) - tokens = [ - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - ] - elif self.tokenization == 'space': - tokens = split_on_whitespace(text) - tokens = [ - str.encode(' '.join(tokens[i:i + self.window_size])) - for i in range(len(tokens) - self.window_size) - ] - else: - raise NotImplementedError( - f'Unimplemented tokenization method [{self.tokenization}]') - - # compute simhash - sample[HashKeys.simhash] = str( - np.uint64(simhash.compute(map(simhash.unsigned_hash, tokens)))) - return sample
- -
[docs] def process(self, dataset, show_num=0): - """ - For doc-level, dataset --> dataset. - - :param dataset: input dataset - :param show_num: number of traced samples used when tracer is - open. - :return: deduplicated dataset and the sampled duplicate pairs. - """ - # no need to deduplicate because too few samples - if len(dataset) <= 1: - return dataset, {} - - # find matches - logger.info(f'Start querying {len(dataset)} samples.') - matches = simhash.find_all( - np.uint64(dataset[HashKeys.simhash]), - self.num_blocks, - self.hamming_distance, - ) - logger.info(f'Querying done, found {len(matches)} matches.') - - # compute hash diff distribution - graph = defaultdict(dict) - for x, y in matches: - x = str(x) - y = str(y) - graph[x][y] = graph[y][x] = True - - hash2ids: Dict[str, Set[str]] = defaultdict(set) - hashes: Set[str] = set(dataset[HashKeys.simhash]) - hash2cluster: Dict[str, int] = {} - visited: Set[str] = set() - cluster_id: int = 0 - - for sid, hash_val in enumerate(dataset[HashKeys.simhash]): - hash2ids[hash_val].add(str(sid)) - - # clustering - dup_pairs = {} # store duplicate pairs when show_num > 0 - while hashes: - hash_val = hashes.pop() - if hash_val in visited: - continue - - # if this hash value is not in the matches list, it's regarded as a - # single cluster - if hash_val not in graph: - continue - - # Otherwise, BFS to find the cluster - q = deque([hash_val]) - visited.add(hash_val) - hash2cluster[hash_val] = cluster_id - if show_num > 0 and len(dup_pairs) < show_num: - dup_pairs[cluster_id] = [] - - while q: - curr = q.popleft() - for neighbor in graph[curr]: - if neighbor in visited: - continue - visited.add(neighbor) - q.append(neighbor) - hash2cluster[neighbor] = cluster_id - - cluster_id += 1 - logger.info(f'Found {cluster_id} clusters and {len(graph)} hashes.') - - # filter duplicated samples - # NOTICE: For now, we only keep the first sample in a cluster. Maybe - # there are some better strategies later. - def _filter_simhash_dup_helper(sample, visited_clusters, - visited_hashes): - sample_hash_val = sample[HashKeys.simhash] - if sample_hash_val not in hash2cluster: - # single-sample cluster, we need to check hash value still. - if sample_hash_val in visited_hashes: - return False - else: - visited_hashes.add(sample_hash_val) - return True - else: - cluster_num = hash2cluster[sample_hash_val] - if show_num > 0 and cluster_num in dup_pairs \ - and len(dup_pairs[cluster_num]) < 2: - dup_pairs[cluster_num].append(sample) - # regular cluster, check cluster number. - if cluster_num in visited_clusters: - return False - else: - visited_clusters.add(cluster_num) - return True - - cluster_record = set() - hash_record = set() - dataset = dataset.filter( - _filter_simhash_dup_helper, - fn_kwargs=dict(visited_clusters=cluster_record, - visited_hashes=hash_record), - load_from_cache_file=False if show_num > 0 else True) - logger.info(f'Keep {len(dataset)} samples after SimHash dedup.') - - return dataset, dup_pairs
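A hedged usage sketch of this op outside a full pipeline; the dataset is assumed to be a Hugging Face-style dataset with a 'text' column, and the usual config-driven wiring is omitted:

op = DocumentSimhashDeduplicator(tokenization='space',
                                 window_size=6,
                                 num_blocks=6,
                                 hamming_distance=4)
dataset = dataset.map(op.compute_hash)  # adds the simhash column per sample
dataset, dup_pairs = op.process(dataset, show_num=2)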
diff --git a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/image_deduplicator.html
deleted file mode 100644
index aee856b8e..000000000
--- a/_modules/data_juicer/ops/deduplicator/image_deduplicator.html
+++ /dev/null
@@ -1,254 +0,0 @@

Source code for data_juicer.ops.deduplicator.image_deduplicator

-from collections import defaultdict
-from typing import Dict, Set, Tuple
-
-import numpy as np
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ..base_op import OPERATORS, Deduplicator
-from ..op_fusion import LOADED_IMAGES
-from .document_deduplicator import DocumentDeduplicator
-
-imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods')
-
-OP_NAME = 'image_deduplicator'
-
-HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
-
-
-def get_hash_method(method_name):
-
-    mapping = {
-        'phash': imgdedup_methods.PHash,
-        'dhash': imgdedup_methods.DHash,
-        'whash': imgdedup_methods.WHash,
-        'ahash': imgdedup_methods.AHash
-    }
-
-    return mapping[method_name]
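For reference, the mapping above resolves to imagededup's hasher classes. A standalone sketch of encoding one image ('example.jpg' is a hypothetical file):

import numpy as np
from PIL import Image
from imagededup.methods import PHash

hasher = PHash()
image_array = np.array(Image.open('example.jpg'))
print(hasher.encode_image(image_array=image_array))  # hex digest string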
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageDeduplicator(Deduplicator):
-    """
-    Deduplicator to deduplicate samples at document-level using exact matching
-    of images between documents.
-    """
-
-    def __init__(self,
-                 method: str = 'phash',
-                 consider_text: bool = False,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param method: perceptual hash method for images, one of
-            ['phash', 'dhash', 'whash', 'ahash']
-        :param consider_text: whether to consider text hash together with
-            image hash when applying deduplication.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        if method not in HASH_METHOD:
-            raise ValueError(f'Hash method [{method}] is not supported. '
-                             f'Can only be one of {HASH_METHOD}.')
-        self.hasher = get_hash_method(method)()
-        self.consider_text = consider_text
-        self.text_dedup_op = None
-        if self.consider_text:
-            self.text_dedup_op = DocumentDeduplicator(**kwargs)
-
-    def compute_hash(self, sample, context=False):
-        # get the hash of text first
-        if self.consider_text:
-            sample = self.text_dedup_op.compute_hash(sample)
-        # check if it's computed already
-        if HashKeys.imagehash in sample:
-            return sample
-
-        # there is no image in this sample
-        sample[HashKeys.imagehash] = ''
-        if self.image_key not in sample or not sample[self.image_key]:
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        # concatenate the hash values of all images as the sample-level hash
-        for key in images:
-            sample[HashKeys.imagehash] += self.hasher.encode_image(
-                image_array=np.array(images[key]))
-        return sample
-
-    def process(self, dataset, show_num=0):
-        """
-        For doc-level, dataset --> dataset.
-
-        :param dataset: input dataset
-        :param show_num: number of traced samples used when the tracer is
-            open.
-        :return: deduplicated dataset and the sampled duplicate pairs.
-        """
-        # no need to deduplicate because there are too few samples
-        if len(dataset) <= 1:
-            return dataset, {}
-
-        dup_hashes = None
-        if show_num > 0:
-            # sample duplicate pairs
-            if self.consider_text:
-                hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
-                hashes = zip(dataset[HashKeys.imagehash],
-                             dataset[HashKeys.hash])
-            else:
-                hash2ids: Dict[int, Set[int]] = defaultdict(set)
-                hashes = dataset[HashKeys.imagehash]
-            for sid, hash_val in enumerate(hashes):
-                if hash_val:
-                    hash2ids[hash_val].add(sid)
-            dup_samples = sorted(list(hash2ids.items()),
-                                 key=lambda x: len(x[1]),
-                                 reverse=True)
-            dup_hashes = set([
-                item[0] for item in dup_samples if len(item[1]) > 1
-            ][:show_num])
-
-        def _filter_dup_helper(sample, hashes):
-            if self.consider_text:
-                hash = (sample[HashKeys.imagehash], sample[HashKeys.hash])
-            else:
-                hash = sample[HashKeys.imagehash]
-            if not hash:
-                return True
-            if show_num > 0 and hash in dup_hashes \
-                    and len(dup_pairs[hash]) < 2:
-                # the tracer is open but there are not enough duplicate
-                # sample pairs yet
-                dup_pairs[hash].append(sample)
-            if hash in hashes:
-                return False
-            else:
-                hashes.add(hash)
-                return True
-
-        hashes = set()
-        dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {}
-        dataset = dataset.filter(
-            _filter_dup_helper,
-            fn_kwargs=dict(hashes=hashes),
-            load_from_cache_file=False if show_num > 0 else True)  # num_proc=1
-        return dataset, dup_pairs
diff --git a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html
deleted file mode 100644
index 675cf3a97..000000000
--- a/_modules/data_juicer/ops/deduplicator/ray_basic_deduplicator.html
+++ /dev/null
@@ -1,164 +0,0 @@

Source code for data_juicer.ops.deduplicator.ray_basic_deduplicator

-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import Filter
-
-redis = LazyLoader('redis', 'redis')
-
-
-
-class RayBasicDeduplicator(Filter):
-    """
-    A basic exact-matching deduplicator for Ray.
-    Although its functionality is deduplication,
-    it is implemented as a Filter sub-class.
-    """
-
-    # TODO: Set a more reasonable value
-    EMPTY_HASH_VALUE = 'EMPTY'
-
-    def __init__(self,
-                 redis_host: str = 'localhost',
-                 redis_port: PositiveInt = 6380,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param redis_host: the hostname of the redis server
-        :param redis_port: the port of the redis server
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.redis_host = redis_host
-        self.redis_port = redis_port
-        # TODO: add a barrier to ensure that flushdb is performed before
-        # the operator is called
-        r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
-        r.flushdb(0)
-
-    def calculate_hash(self, sample, context=False):
-        """Calculate the hash value for the sample. Implemented by
-        subclasses."""
-        raise NotImplementedError
-
-    def compute_stats_single(self, sample, context=False):
-        # init redis client
-        r = redis.StrictRedis(host=self.redis_host, port=self.redis_port, db=0)
-        # compute the hash value of this sample
-        hash_value = self.calculate_hash(sample, context)
-        # SETNX returns True only for the first occurrence of a hash value,
-        # so only that sample is marked to be kept
-        sample[HashKeys.is_duplicate] = r.setnx(hash_value, 1)
-        return sample
-
-    def process_single(self, sample):
-        return sample[HashKeys.is_duplicate]
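The whole dedup protocol reduces to Redis's SETNX ("set if not exists"): the first writer of a key wins and every later writer sees False. A sketch, assuming a Redis server is listening on localhost:6380:

import redis

r = redis.StrictRedis(host='localhost', port=6380, db=0)
print(r.setnx('sample-hash', 1))  # True: first occurrence, sample is kept
print(r.setnx('sample-hash', 1))  # False: duplicate, sample is dropped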
diff --git a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html
deleted file mode 100644
index b90db6d27..000000000
--- a/_modules/data_juicer/ops/deduplicator/ray_document_deduplicator.html
+++ /dev/null
@@ -1,164 +0,0 @@

Source code for data_juicer.ops.deduplicator.ray_document_deduplicator

-import hashlib
-import string
-
-import regex as re
-from pydantic import PositiveInt
-
-from ..base_op import OPERATORS
-from .ray_basic_deduplicator import RayBasicDeduplicator
-
-OP_NAME = 'ray_document_deduplicator'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-class RayDocumentDeduplicator(RayBasicDeduplicator):
-    """
-    Deduplicator to deduplicate samples at document-level using exact matching.
-    """
-
-    def __init__(self,
-                 redis_host: str = 'localhost',
-                 redis_port: PositiveInt = 6380,
-                 lowercase: bool = False,
-                 ignore_non_character: bool = False,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param redis_host: the hostname of the redis server
-        :param redis_port: the port of the redis server
-        :param lowercase: whether to convert sample text to lower case
-        :param ignore_non_character: whether to ignore non-alphabet
-            characters, including whitespaces, digits, and punctuation marks
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(redis_host=redis_host,
-                         redis_port=redis_port,
-                         *args,
-                         **kwargs)
-        self.lowercase = lowercase
-        self.remove_non_character_regex = re.compile(
-            f'\s+|\d+|[{re.escape(string.punctuation)}]'  # noqa: W605
-        ) if ignore_non_character else None
-
-    def calculate_hash(self, sample, context=False):
-        if self.text_key not in sample or not sample[self.text_key]:
-            return RayBasicDeduplicator.EMPTY_HASH_VALUE
-
-        text = sample[self.text_key]
-        if self.lowercase:
-            text = text.lower()
-        if self.remove_non_character_regex:
-            text = self.remove_non_character_regex.sub('', text)
-
-        return hashlib.md5(text.strip().encode('utf-8')).hexdigest()
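A worked example of the normalization above with lowercase=True and ignore_non_character=True:

import hashlib
import string

import regex as re

text = 'Hello, WORLD 42!'
pattern = re.compile(f'\\s+|\\d+|[{re.escape(string.punctuation)}]')
normalized = pattern.sub('', text.lower())  # -> 'helloworld'
print(hashlib.md5(normalized.strip().encode('utf-8')).hexdigest())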
diff --git a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html
deleted file mode 100644
index 7f3996efa..000000000
--- a/_modules/data_juicer/ops/deduplicator/ray_image_deduplicator.html
+++ /dev/null
@@ -1,184 +0,0 @@

Source code for data_juicer.ops.deduplicator.ray_image_deduplicator

-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ..base_op import OPERATORS
-from ..op_fusion import LOADED_IMAGES
-from .ray_basic_deduplicator import RayBasicDeduplicator
-
-imgdedup_methods = LazyLoader('imgdedup_methods', 'imagededup.methods')
-
-OP_NAME = 'ray_image_deduplicator'
-
-HASH_METHOD = {'phash', 'dhash', 'whash', 'ahash'}
-
-
-def get_hash_method(method_name):
-
-    mapping = {
-        'phash': imgdedup_methods.PHash,
-        'dhash': imgdedup_methods.DHash,
-        'whash': imgdedup_methods.WHash,
-        'ahash': imgdedup_methods.AHash
-    }
-
-    return mapping[method_name]
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class RayImageDeduplicator(RayBasicDeduplicator):
-    """
-    Deduplicator to deduplicate samples at document-level using exact matching
-    of images between documents.
-    """
-
-    def __init__(self,
-                 redis_host: str = 'localhost',
-                 redis_port: PositiveInt = 6380,
-                 method: str = 'phash',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param redis_host: the hostname of the redis server
-        :param redis_port: the port of the redis server
-        :param method: perceptual hash method for images, one of
-            ['phash', 'dhash', 'whash', 'ahash']
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(redis_host=redis_host,
-                         redis_port=redis_port,
-                         *args,
-                         **kwargs)
-        if method not in HASH_METHOD:
-            raise ValueError(f'Hash method [{method}] is not supported. '
-                             f'Can only be one of {HASH_METHOD}.')
-        self.hasher = get_hash_method(method)()
-
-    def calculate_hash(self, sample, context=False):
-        if self.image_key not in sample or not sample[self.image_key]:
-            return RayBasicDeduplicator.EMPTY_HASH_VALUE
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        # concatenate the hash values of all images as the sample-level hash
-        hash_value = ''
-        for key in images:
-            hash_value += self.hasher.encode_image(
-                image_array=np.array(images[key]))
-
-        return hash_value
diff --git a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
deleted file mode 100644
index 4465db863..000000000
--- a/_modules/data_juicer/ops/deduplicator/ray_video_deduplicator.html
+++ /dev/null
@@ -1,168 +0,0 @@

Source code for data_juicer.ops.deduplicator.ray_video_deduplicator

-import hashlib
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
-                                        load_video)
-
-from ..base_op import OPERATORS
-from ..op_fusion import LOADED_VIDEOS
-from .ray_basic_deduplicator import RayBasicDeduplicator
-
-OP_NAME = 'ray_video_deduplicator'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class RayVideoDeduplicator(RayBasicDeduplicator):
-    """
-    Deduplicator to deduplicate samples at document-level using exact matching
-    of videos between documents.
-    """
-
-    def __init__(self,
-                 redis_host: str = 'localhost',
-                 redis_port: PositiveInt = 6380,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param redis_host: the hostname of the redis server
-        :param redis_port: the port of the redis server
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(redis_host=redis_host,
-                         redis_port=redis_port,
-                         *args,
-                         **kwargs)
-
-    def calculate_hash(self, sample, context=False):
-        if self.video_key not in sample or not sample[self.video_key]:
-            return RayBasicDeduplicator.EMPTY_HASH_VALUE
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        sample, videos = load_data_with_context(sample, context,
-                                                loaded_video_keys, load_video)
-        # hash the packets of all video streams in each container
-        md5_hash = hashlib.md5()
-        for key in videos:
-            for packet in videos[key].demux():
-                if packet.stream.type == 'video':
-                    md5_hash.update(bytes(packet))
-
-        for key in videos:
-            close_video(videos[key])
-
-        return md5_hash.hexdigest()
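The containers returned by load_video are PyAV containers, so the loop above is essentially plain PyAV. A standalone sketch ('example.mp4' is a hypothetical path):

import hashlib

import av

md5_hash = hashlib.md5()
container = av.open('example.mp4')
for packet in container.demux():
    # hash the raw encoded packets, not decoded frames
    if packet.stream.type == 'video':
        md5_hash.update(bytes(packet))
container.close()
print(md5_hash.hexdigest())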
diff --git a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html b/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
deleted file mode 100644
index 3cd615c39..000000000
--- a/_modules/data_juicer/ops/deduplicator/video_deduplicator.html
+++ /dev/null
@@ -1,236 +0,0 @@

Source code for data_juicer.ops.deduplicator.video_deduplicator

-import hashlib
-from collections import defaultdict
-from typing import Dict, Set, Tuple
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
-                                        load_video)
-
-from ..base_op import OPERATORS, Deduplicator
-from ..op_fusion import LOADED_VIDEOS
-from .document_deduplicator import DocumentDeduplicator
-
-OP_NAME = 'video_deduplicator'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class VideoDeduplicator(Deduplicator):
-    """
-    Deduplicator to deduplicate samples at document-level using exact matching
-    of videos between documents.
-    """
-
-    def __init__(self, consider_text: bool = False, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param consider_text: whether to consider text hash together with
-            video hash when applying deduplication.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.consider_text = consider_text
-        self.text_dedup_op = None
-        if self.consider_text:
-            self.text_dedup_op = DocumentDeduplicator(**kwargs)
-
-    def compute_hash(self, sample, context=False):
-        # get the hash of text first
-        if self.consider_text:
-            sample = self.text_dedup_op.compute_hash(sample)
-        # check if it's computed already
-        if HashKeys.videohash in sample:
-            return sample
-
-        # there is no video in this sample
-        sample[HashKeys.videohash] = ''
-        if self.video_key not in sample or not sample[self.video_key]:
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        sample, videos = load_data_with_context(sample, context,
-                                                loaded_video_keys, load_video)
-
-        # hash the packets of all video streams in each container
-        md5_hash = hashlib.md5()
-        for key in videos:
-            for packet in videos[key].demux():
-                if packet.stream.type == 'video':
-                    md5_hash.update(bytes(packet))
-
-        for key in videos:
-            close_video(videos[key])
-
-        sample[HashKeys.videohash] = md5_hash.hexdigest()
-        return sample
-
-    def process(self, dataset, show_num=0):
-        """
-        For doc-level, dataset --> dataset.
-
-        :param dataset: input dataset
-        :param show_num: number of traced samples used when the tracer is
-            open.
-        :return: deduplicated dataset and the sampled duplicate pairs.
-        """
-        # no need to deduplicate because there are too few samples
-        if len(dataset) <= 1:
-            return dataset, {}
-
-        dup_hashes = None
-        if show_num > 0:
-            # sample duplicate pairs
-            if self.consider_text:
-                hash2ids: Dict[Tuple[int, int], Set[int]] = defaultdict(set)
-                hashes = zip(dataset[HashKeys.videohash],
-                             dataset[HashKeys.hash])
-            else:
-                hash2ids: Dict[int, Set[int]] = defaultdict(set)
-                hashes = dataset[HashKeys.videohash]
-            for sid, hash_val in enumerate(hashes):
-                if hash_val:
-                    hash2ids[hash_val].add(sid)
-            dup_samples = sorted(list(hash2ids.items()),
-                                 key=lambda x: len(x[1]),
-                                 reverse=True)
-            dup_hashes = set([
-                item[0] for item in dup_samples if len(item[1]) > 1
-            ][:show_num])
-
-        def _filter_dup_helper(sample, hashes):
-            if self.consider_text:
-                hash = (sample[HashKeys.videohash], sample[HashKeys.hash])
-            else:
-                hash = sample[HashKeys.videohash]
-            if not hash:
-                return True
-            if show_num > 0 and hash in dup_hashes \
-                    and len(dup_pairs[hash]) < 2:
-                # the tracer is open but there are not enough duplicate
-                # sample pairs yet
-                dup_pairs[hash].append(sample)
-            if hash in hashes:
-                return False
-            else:
-                hashes.add(hash)
-                return True
-
-        hashes = set()
-        dup_pairs = {hash_v: [] for hash_v in dup_hashes} if dup_hashes else {}
-        dataset = dataset.filter(
-            _filter_dup_helper,
-            fn_kwargs=dict(hashes=hashes),
-            load_from_cache_file=False if show_num > 0 else True)  # num_proc=1
-        return dataset, dup_pairs
diff --git a/_modules/data_juicer/ops/filter/alphanumeric_filter.html b/_modules/data_juicer/ops/filter/alphanumeric_filter.html
deleted file mode 100644
index 9c304f8ce..000000000
--- a/_modules/data_juicer/ops/filter/alphanumeric_filter.html
+++ /dev/null
@@ -1,203 +0,0 @@

Source code for data_juicer.ops.filter.alphanumeric_filter

-import sys
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import get_words_from_document
-
-OP_NAME = 'alphanumeric_filter'
-
-
-
-@OPERATORS.register_module('alphanumeric_filter')
-class AlphanumericFilter(Filter):
-    """Filter to keep samples with an alphabet/numeric ratio within a
-    specific range."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 tokenization: bool = False,
-                 min_ratio: float = 0.25,
-                 max_ratio: float = sys.maxsize,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param tokenization: whether to count the ratio of alphanumeric
-            characters to the total number of tokens. If tokenization=False,
-            count the ratio of alphanumeric characters to the total number
-            of characters instead.
-        :param min_ratio: the min filter ratio in this op; samples will
-            be filtered if their alphabet/numeric ratio is below this
-            parameter.
-        :param max_ratio: the max filter ratio in this op; samples will
-            be filtered if their alphabet/numeric ratio exceeds this
-            parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.tokenization = tokenization
-        self.min_ratio = min_ratio
-        self.max_ratio = max_ratio
-        self.model_key = None
-
-        if tokenization:
-            self.model_key = prepare_model(
-                model_type='huggingface',
-                pretrained_model_name_or_path='EleutherAI/pythia-6.9b-deduped',
-                return_model=False)
-
-    def compute_stats_batched(self, samples):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-
-        for idx, stat in enumerate(samples_stats):
-            cur_text = samples_list[idx]
-            if self.tokenization:
-                # check if it's computed already
-                if StatsKeys.alpha_token_ratio in stat:
-                    continue
-                alpha_count = sum(
-                    map(lambda char: 1 if char.isalpha() else 0, cur_text))
-                tokenizer = get_model(self.model_key)
-                token_count = len(
-                    get_words_from_document(
-                        cur_text,
-                        token_func=tokenizer.tokenize if tokenizer else None))
-                samples_stats[idx][StatsKeys.alpha_token_ratio] = (
-                    alpha_count / token_count) if token_count != 0 else 0.0
-            else:
-                if StatsKeys.alnum_ratio in stat:
-                    continue
-                alnum_count = sum(
-                    map(lambda char: 1 if char.isalnum() else 0, cur_text))
-                samples_stats[idx][StatsKeys.alnum_ratio] = (
-                    alnum_count / len(cur_text)) if len(cur_text) != 0 else 0.0
-
-        return samples
-
-    def process_batched(self, samples):
-        ratio_key = StatsKeys.alpha_token_ratio if self.tokenization \
-            else StatsKeys.alnum_ratio
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: self.min_ratio <= stat[ratio_key]
-                <= self.max_ratio, samples[Fields.stats])
-        else:
-            # a single sample for the ray filter
-            return self.min_ratio <= samples[
-                Fields.stats][ratio_key] <= self.max_ratio
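A worked example of the character-level branch (tokenization=False):

text = 'Hello, world! 123'
alnum_count = sum(char.isalnum() for char in text)
print(alnum_count / len(text))  # 13 / 17 ≈ 0.76, kept when min_ratio=0.25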
diff --git a/_modules/data_juicer/ops/filter/audio_duration_filter.html b/_modules/data_juicer/ops/filter/audio_duration_filter.html
deleted file mode 100644
index 782de8051..000000000
--- a/_modules/data_juicer/ops/filter/audio_duration_filter.html
+++ /dev/null
@@ -1,199 +0,0 @@

Source code for data_juicer.ops.filter.audio_duration_filter

-import sys
-
-import librosa
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import load_audio, load_data_with_context
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_AUDIOS
-
-OP_NAME = 'audio_duration_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_AUDIOS.register_module(OP_NAME)
-class AudioDurationFilter(Filter):
-    """Keep data samples whose audios' durations are within a specified
-    range.
-    """
-
-    def __init__(self,
-                 min_duration: int = 0,
-                 max_duration: int = sys.maxsize,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_duration: the min audio duration (in seconds) to keep
-            samples. It's 0 by default.
-        :param max_duration: the max audio duration (in seconds) to keep
-            samples. It's sys.maxsize by default.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all audios. 'any': keep this sample if any audios meet the
-            condition. 'all': keep this sample only if all audios meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_duration = min_duration
-        self.max_duration = max_duration
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.audio_duration in sample[Fields.stats]:
-            return sample
-
-        # there is no audio in this sample
-        if self.audio_key not in sample or not sample[self.audio_key]:
-            sample[Fields.stats][StatsKeys.audio_duration] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load audios
-        loaded_audio_keys = sample[self.audio_key]
-        sample, audios = load_data_with_context(sample, context,
-                                                loaded_audio_keys, load_audio)
-
-        audio_durations = {
-            audio_key: librosa.get_duration(y=audio[0], sr=audio[1])
-            for audio_key, audio in audios.items()
-        }
-
-        # get audio durations
-        sample[Fields.stats][StatsKeys.audio_duration] = [
-            audio_durations[audio_key] for audio_key in sample[self.audio_key]
-        ]
-
-        return sample
-
-    def process_single(self, sample):
-        audio_durations = sample[Fields.stats][StatsKeys.audio_duration]
-        keep_bools = np.array([
-            self.min_duration <= duration <= self.max_duration
-            for duration in audio_durations
-        ])
-        if len(keep_bools) <= 0:
-            return True
-
-        # apply the 'any' or 'all' keep strategy
-        return keep_bools.any() if self.any else keep_bools.all()
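The duration stat comes straight from librosa. A standalone sketch ('example.wav' is a hypothetical file):

import librosa

audio, sr = librosa.load('example.wav', sr=None)
print(librosa.get_duration(y=audio, sr=sr))  # duration in seconds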
diff --git a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html b/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html
deleted file mode 100644
index 7fdd06324..000000000
--- a/_modules/data_juicer/ops/filter/audio_nmf_snr_filter.html
+++ /dev/null
@@ -1,247 +0,0 @@

Source code for data_juicer.ops.filter.audio_nmf_snr_filter

-import sys
-
-import librosa
-import numpy as np
-from librosa.decompose import decompose
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import load_audio, load_data_with_context
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_AUDIOS
-
-OP_NAME = 'audio_nmf_snr_filter'
-
-
-# run NMF to decompose the signal and noise from the input audio
-def separate_signal_noise(audio, n_components=2, nmf_iter=500):
-    # convert to the spectral domain using the Short-Time Fourier Transform;
-    # keep the complex STFT so its phase can be reused for reconstruction
-    D = librosa.stft(audio)
-    S = np.abs(D)
-
-    # run NMF to decompose the audio
-    W, H = decompose(S,
-                     n_components=n_components,
-                     init='random',
-                     random_state=0,
-                     max_iter=nmf_iter)
-
-    # get signal and noise
-    signal = np.dot(W[:, 0:1], H[0:1, :])
-    noise = np.dot(W[:, 1:2], H[1:2, :])
-
-    # convert back to the time domain, reusing the phase of the original
-    # STFT (the magnitude-only S carries no phase information)
-    phase = np.exp(1j * np.angle(D))
-    signal_audio = librosa.istft(signal * phase)
-    noise_audio = librosa.istft(noise * phase)
-
-    return signal_audio, noise_audio
-
-
-# compute the SNR of an audio with NMF algorithm
-def compute_nmf_snr(audio_data, nmf_iter=500):
-    # separate the signal and noise parts from the original audio
-    signal, noise = separate_signal_noise(audio_data,
-                                          n_components=2,
-                                          nmf_iter=nmf_iter)
-
-    # compute the power of signal and noise
-    power_signal = np.mean(signal**2)
-    power_noise = np.mean(noise**2)
-
-    # compute SNR in dB
-    if power_noise == 0:
-        snr = np.finfo(np.float64).max
-    else:
-        snr = 10 * np.log10(power_signal / power_noise)
-
-    return snr
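A hedged usage sketch of the helper above on librosa's bundled example clip (fetched on first use, so it assumes network access):

import librosa

audio, sr = librosa.load(librosa.example('trumpet'), sr=None)
print(f'estimated SNR: {compute_nmf_snr(audio, nmf_iter=500):.2f} dB')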
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_AUDIOS.register_module(OP_NAME)
-class AudioNMFSNRFilter(Filter):
-    """Keep data samples whose audios' SNRs (computed based on NMF) are
-    within a specified range.
-    """
-
-    def __init__(self,
-                 min_snr: float = 0,
-                 max_snr: float = sys.maxsize,
-                 nmf_iter_num: PositiveInt = 500,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_snr: the min audio SNR (in dB) to keep samples. It's 0 by
-            default.
-        :param max_snr: the max audio SNR (in dB) to keep samples. It's
-            sys.maxsize by default.
-        :param nmf_iter_num: the max number of iterations to run NMF. It's
-            500 by default.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all audios. 'any': keep this sample if any audios meet the
-            condition. 'all': keep this sample only if all audios meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_snr = min_snr
-        self.max_snr = max_snr
-        self.nmf_iter_num = nmf_iter_num
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.audio_nmf_snr in sample[Fields.stats]:
-            return sample
-
-        # there is no audio in this sample
-        if self.audio_key not in sample or not sample[self.audio_key]:
-            sample[Fields.stats][StatsKeys.audio_nmf_snr] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load audios
-        loaded_audio_keys = sample[self.audio_key]
-        sample, audios = load_data_with_context(sample, context,
-                                                loaded_audio_keys, load_audio)
-
-        audio_snrs = {
-            audio_key: compute_nmf_snr(audio[0], self.nmf_iter_num)
-            for audio_key, audio in audios.items()
-        }
-
-        # get audio SNRs
-        sample[Fields.stats][StatsKeys.audio_nmf_snr] = [
-            audio_snrs[audio_key] for audio_key in sample[self.audio_key]
-        ]
-
-        return sample
-
-    def process_single(self, sample):
-        audio_snrs = sample[Fields.stats][StatsKeys.audio_nmf_snr]
-        keep_bools = np.array(
-            [self.min_snr <= snr <= self.max_snr for snr in audio_snrs])
-        if len(keep_bools) <= 0:
-            return True
-
-        # apply the 'any' or 'all' keep strategy
-        return keep_bools.any() if self.any else keep_bools.all()
diff --git a/_modules/data_juicer/ops/filter/audio_size_filter.html b/_modules/data_juicer/ops/filter/audio_size_filter.html
deleted file mode 100644
index 229219ca1..000000000
--- a/_modules/data_juicer/ops/filter/audio_size_filter.html
+++ /dev/null
@@ -1,183 +0,0 @@

Source code for data_juicer.ops.filter.audio_size_filter

-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import get_file_size, size_to_bytes
-
-from ..base_op import OPERATORS, Filter
-
-
-
-@OPERATORS.register_module('audio_size_filter')
-class AudioSizeFilter(Filter):
-    """Keep data samples whose audio sizes (in bytes/KB/MB/...) are within a
-    specific range.
-    """
-
-    def __init__(self,
-                 min_size: str = '0',
-                 max_size: str = '1TB',
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_size: the min audio size to keep samples. Set to "0" by
-            default for no size constraint
-        :param max_size: the max audio size to keep samples. Set to "1TB" by
-            default, an approximation of the unlimited case
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all audios. 'any': keep this sample if any audios meet the
-            condition. 'all': keep this sample only if all audios meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_size = size_to_bytes(min_size)
-        self.max_size = size_to_bytes(max_size)
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.audio_sizes in sample[Fields.stats]:
-            return sample
-
-        # there is no audio in this sample
-        if self.audio_key not in sample or not sample[self.audio_key]:
-            sample[Fields.stats][StatsKeys.audio_sizes] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # for size calculation, there is no need to load audios into memory
-        sample[Fields.stats][StatsKeys.audio_sizes] = [
-            get_file_size(aud_path) for aud_path in sample[self.audio_key]
-        ]
-
-        return sample
-
-    def process_single(self, sample):
-        audio_sizes = sample[Fields.stats][StatsKeys.audio_sizes]
-        keep_bools = np.array([
-            self.min_size <= audio_size <= self.max_size
-            for audio_size in audio_sizes
-        ])
-        if len(keep_bools) <= 0:
-            return True
-
-        # apply the 'any' or 'all' keep strategy
-        return keep_bools.any() if self.any else keep_bools.all()
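A worked example of the byte-range check above; size_to_bytes is assumed to follow 1KB = 1024 bytes:

min_size = 32 * 1024        # '32KB'
max_size = 1024 ** 4        # '1TB'
file_size = 5 * 1024 ** 2   # a 5MB audio file
print(min_size <= file_size <= max_size)  # True -> the audio passes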
diff --git a/_modules/data_juicer/ops/filter/average_line_length_filter.html b/_modules/data_juicer/ops/filter/average_line_length_filter.html
deleted file mode 100644
index 70e79e6ae..000000000
--- a/_modules/data_juicer/ops/filter/average_line_length_filter.html
+++ /dev/null
@@ -1,181 +0,0 @@

Source code for data_juicer.ops.filter.average_line_length_filter

-import sys
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_LINES
-
-OP_NAME = 'average_line_length_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@INTER_LINES.register_module(OP_NAME)
-class AverageLineLengthFilter(Filter):
-    """Filter to keep samples with average line length within a specific
-    range."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 min_len: int = 10,
-                 max_len: int = sys.maxsize,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_len: the min filter length in this op; samples will
-            be filtered if their average line length is below this
-            parameter.
-        :param max_len: the max filter length in this op; samples will
-            be filtered if their average line length exceeds this
-            parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_len = min_len
-        self.max_len = max_len
-
-    def compute_stats_batched(self, samples, context=False):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-        context_key = f'{InterVars.lines}'
-
-        for idx, stat in enumerate(samples_stats):
-            # check if it's computed already
-            if StatsKeys.avg_line_length in stat:
-                continue
-
-            cur_text = samples_list[idx]
-            if context and context_key in samples[Fields.context][idx]:
-                lines = samples[Fields.context][idx][context_key]
-            else:
-                lines = cur_text.splitlines()
-                if context:
-                    samples[Fields.context][idx][context_key] = lines
-            samples_stats[idx][StatsKeys.avg_line_length] = \
-                len(cur_text) / len(lines) if len(lines) != 0 else 0.0
-        return samples
-
-    def process_batched(self, samples):
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: self.min_len <= stat[StatsKeys.avg_line_length]
-                <= self.max_len, samples[Fields.stats])
-        else:
-            # a single sample for the ray filter
-            return self.min_len <= samples[Fields.stats][
-                StatsKeys.avg_line_length] <= self.max_len
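A worked example of the stat above (note that empty lines still count):

text = 'line one\n\nline three'
lines = text.splitlines()      # ['line one', '', 'line three']
print(len(text) / len(lines))  # 20 characters / 3 lines ≈ 6.67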
diff --git a/_modules/data_juicer/ops/filter/character_repetition_filter.html b/_modules/data_juicer/ops/filter/character_repetition_filter.html
deleted file mode 100644
index f9391cd62..000000000
--- a/_modules/data_juicer/ops/filter/character_repetition_filter.html
+++ /dev/null
@@ -1,201 +0,0 @@

Source code for data_juicer.ops.filter.character_repetition_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-
-
-
-@OPERATORS.register_module('character_repetition_filter')
-class CharacterRepetitionFilter(Filter):
-    """Filter to keep samples with a char-level n-gram repetition ratio
-    within a specific range."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 rep_len: PositiveInt = 10,
-                 min_ratio: float = 0.0,
-                 max_ratio: float = 0.5,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param rep_len: repetition length for char-level n-grams.
-        :param min_ratio: the min filter ratio in this op; samples will
-            be filtered if their char-level n-gram repetition ratio is
-            below this parameter.
-        :param max_ratio: the max filter ratio in this op; samples will
-            be filtered if their char-level n-gram repetition ratio
-            exceeds this parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.n = rep_len
-        self.min_ratio = min_ratio
-        self.max_ratio = max_ratio
-
-    def compute_stats_batched(self, samples):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-
-        for idx, stat in enumerate(samples_stats):
-            # check if it's computed already
-            if StatsKeys.char_rep_ratio in stat:
-                continue
-
-            cur_text = samples_list[idx]
-            char_ngrams = [
-                cur_text[i:i + self.n]
-                for i in range(len(cur_text) - self.n + 1)
-            ]
-            freq_char_ngrams = {}
-            for char_ngram in char_ngrams:
-                freq_char_ngrams[char_ngram] = (
-                    freq_char_ngrams.get(char_ngram, 0) + 1)
-
-            if len(freq_char_ngrams) == 0:
-                samples_stats[idx][StatsKeys.char_rep_ratio] = 0.0
-                continue
-
-            freq_char_ngrams = sorted(list(freq_char_ngrams.values()),
-                                      reverse=True)
-            num_no_rep_char_ngrams = len(
-                [el for el in freq_char_ngrams if el == 1])
-            num_rep_char_ngrams = min(
-                int(np.sqrt(len(freq_char_ngrams))),
-                len(freq_char_ngrams) - num_no_rep_char_ngrams,
-            )
-            samples_stats[idx][StatsKeys.char_rep_ratio] = (
-                sum(freq_char_ngrams[:num_rep_char_ngrams]) /
-                sum(freq_char_ngrams)) if sum(freq_char_ngrams) != 0 else 0.0
-
-        return samples
-
-    def process_batched(self, samples):
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: self.min_ratio <= stat[StatsKeys.char_rep_ratio]
-                <= self.max_ratio, samples[Fields.stats])
-        else:
-            # a single sample for the ray filter
-            return self.min_ratio <= samples[Fields.stats][
-                StatsKeys.char_rep_ratio] <= self.max_ratio
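A worked example of the repetition ratio with rep_len=2:

import numpy as np

text, n = 'ababab', 2
ngrams = [text[i:i + n] for i in range(len(text) - n + 1)]
freqs = {}
for g in ngrams:
    freqs[g] = freqs.get(g, 0) + 1
counts = sorted(freqs.values(), reverse=True)        # [3, 2]
num_unique = len([c for c in counts if c == 1])      # 0
num_rep = min(int(np.sqrt(len(counts))), len(counts) - num_unique)  # 1
print(sum(counts[:num_rep]) / sum(counts))           # 3 / 5 = 0.6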
diff --git a/_modules/data_juicer/ops/filter/flagged_words_filter.html b/_modules/data_juicer/ops/filter/flagged_words_filter.html
deleted file mode 100644
index a956d64ff..000000000
--- a/_modules/data_juicer/ops/filter/flagged_words_filter.html
+++ /dev/null
@@ -1,233 +0,0 @@

Source code for data_juicer.ops.filter.flagged_words_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from typing import List
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ...utils.asset_utils import ASSET_DIR, load_words_asset
-from ..base_op import OPERATORS, Filter
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-from ..op_fusion import INTER_WORDS
-
-OP_NAME = 'flagged_words_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@INTER_WORDS.register_module(OP_NAME)
-class FlaggedWordFilter(Filter):
-    """Filter to keep samples with a flagged-word ratio less than a specific
-    max value."""
-
-    def __init__(self,
-                 lang: str = 'en',
-                 tokenization: bool = False,
-                 max_ratio: float = 0.045,
-                 flagged_words_dir: str = ASSET_DIR,
-                 use_words_aug: bool = False,
-                 words_aug_group_sizes: List[PositiveInt] = [2],
-                 words_aug_join_char: str = '',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param lang: language of the flagged words to consider. If lang ==
-            "all", adopt the list merged from all available languages
-        :param tokenization: whether to use a tokenizer model to tokenize
-            documents
-        :param max_ratio: the max filter ratio in this op.
-        :param flagged_words_dir: the directory storing the flagged_words
-            file(s) whose names include "flagged_words" and are in json
-            format
-        :param use_words_aug: whether to augment words, especially for
-            Chinese and Vietnamese
-        :param words_aug_group_sizes: the group sizes of words to augment
-        :param words_aug_join_char: the join char between words to augment
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.lang = lang
-        self.max_ratio = max_ratio
-        self.use_words_aug = use_words_aug
-        self.words_aug_group_sizes = words_aug_group_sizes
-        self.words_aug_join_char = words_aug_join_char
-        self.model_key = None
-
-        self.FLAGGED_WORDS = load_words_asset(words_dir=flagged_words_dir,
-                                              words_type='flagged_words')
-
-        if 'all' not in self.FLAGGED_WORDS:
-            self.FLAGGED_WORDS['all'] = [
-                val for vals in self.FLAGGED_WORDS.values() for val in vals
-            ]
-        if tokenization:
-            self.model_key = prepare_model(model_type='sentencepiece',
-                                           lang=lang)
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.flagged_words_ratio in sample[Fields.stats]:
-            return sample
-
-        # try to get words from the context
-        words_key = f'{InterVars.words}-{self.model_key}'
-        if context and words_key in sample[Fields.context]:
-            words = sample[Fields.context][words_key]
-        else:
-            tokenizer = get_model(self.model_key)
-            words = get_words_from_document(
-                sample[self.text_key],
-                token_func=tokenizer.encode_as_pieces if tokenizer else None)
-            if context:
-                sample[Fields.context][words_key] = words
-
-        # try to get refined words from the context
-        refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \
-                            f'{self.use_words_aug}-' \
-                            f'{self.words_aug_group_sizes}-' \
-                            f'{self.words_aug_join_char}'
-        if context and refined_words_key in sample[Fields.context]:
-            words = sample[Fields.context][refined_words_key]
-        else:
-            words = words_refinement(
-                words,
-                lower_case=True,
-                strip_chars=SPECIAL_CHARACTERS,
-                use_words_aug=self.use_words_aug,
-                words_aug_group_sizes=self.words_aug_group_sizes,
-                words_aug_join_char=self.words_aug_join_char)
-            if context:
-                sample[Fields.context][refined_words_key] = words
-
-        flagged_words_ratio = (len(
-            [word
-             for word in words if word in self.FLAGGED_WORDS[self.lang]]) /
-            len(words)) if len(words) != 0 else 0.0
-
-        # clamp the ratio, which may exceed 1.0 when words augmentation is on
-        if flagged_words_ratio > 1.0:
-            flagged_words_ratio = 1.0
-
-        sample[Fields.stats][
-            StatsKeys.flagged_words_ratio] = flagged_words_ratio
-        return sample
-
-    def process_single(self, sample):
-        return sample[Fields.stats][
-            StatsKeys.flagged_words_ratio] <= self.max_ratio
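A worked example of the final ratio; the word list here is hypothetical:

words = ['this', 'sample', 'is', 'clean']
flagged_words = {'badword'}  # hypothetical flagged-word list
ratio = len([w for w in words if w in flagged_words]) / len(words)
print(ratio <= 0.045)  # True -> the sample is kept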
diff --git a/_modules/data_juicer/ops/filter/image_aesthetics_filter.html b/_modules/data_juicer/ops/filter/image_aesthetics_filter.html
deleted file mode 100644
index 72610ece7..000000000
--- a/_modules/data_juicer/ops/filter/image_aesthetics_filter.html
+++ /dev/null
@@ -1,231 +0,0 @@

Source code for data_juicer.ops.filter.image_aesthetics_filter

-import numpy as np
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ...utils.model_utils import get_model, prepare_model
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'image_aesthetics_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageAestheticsFilter(Filter):
-    """Filter to keep samples with aesthetics scores within a specific range.
-    """
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_scorer_model: str = '',
-                 trust_remote_code: bool = False,
-                 min_score: float = 0.5,
-                 max_score: float = 1.0,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_scorer_model: Huggingface model name for the aesthetics
-            predictor. By default, we will use
-            'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE',
-            refer to pypi.org/project/simple-aesthetics-predictor
-        :param trust_remote_code: whether to trust remote code when loading
-            the Huggingface model
-        :param min_score: min score for the predicted aesthetics in an image.
-        :param max_score: max score for the predicted aesthetics in an image.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra positional arguments.
-        :param kwargs: extra keyword arguments.
-        """
-        super().__init__(*args, **kwargs)
-        if hf_scorer_model == '':
-            hf_scorer_model = \
-                'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE'
-        self.min_score = min_score
-        self.max_score = max_score
-
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-        self.model_key = prepare_model(
-            model_type='simple_aesthetics',
-            pretrained_model_name_or_path=hf_scorer_model,
-            trust_remote_code=trust_remote_code)
-        # the original scores predicted by laion-ai's scorer are within [0, 10]
-        self.need_normalized_by_ten = ('shunk031/aesthetics-predictor'
-                                       in hf_scorer_model)
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_aesthetics_scores in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_aesthetics_scores] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        # compute aesthetics scores
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-        inputs = processor(images=list(images.values()),
-                           return_tensors='pt').to(model.device)
-        with torch.no_grad():
-            outputs = model(**inputs)
-            if self.need_normalized_by_ten:
-                aesthetics_scores = outputs.logits / 10.0
-            else:
-                aesthetics_scores = outputs.logits
-
-        aesthetics_scores = [
-            aesthetics_score.item() for aesthetics_score in aesthetics_scores
-        ]
-
-        logger.debug(f'aesthetics_scores: {aesthetics_scores}')
-
-        sample[Fields.stats][StatsKeys.image_aesthetics_scores] = \
-            aesthetics_scores
-        return sample
-
-    def process_single(self, sample):
-        aesthetics_scores = \
-            sample[Fields.stats][StatsKeys.image_aesthetics_scores]
-        if len(aesthetics_scores) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_score <= aesthetics_score <= self.max_score
-            for aesthetics_score in aesthetics_scores
-        ])
-
-        # apply the 'any' or 'all' keep strategy
-        return keep_bools.any() if self.any else keep_bools.all()
diff --git a/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html
deleted file mode 100644
index 035be3660..000000000
--- a/_modules/data_juicer/ops/filter/image_aspect_ratio_filter.html
+++ /dev/null
@@ -1,191 +0,0 @@

Source code for data_juicer.ops.filter.image_aspect_ratio_filter

-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-
-
-@OPERATORS.register_module('image_aspect_ratio_filter')
-@LOADED_IMAGES.register_module('image_aspect_ratio_filter')
-class ImageAspectRatioFilter(Filter):
-    """Filter to keep samples with image aspect ratio within a specific
-    range. AspectRatio = W / H.
-    """
-
-    def __init__(self,
-                 min_ratio: float = 0.333,
-                 max_ratio: float = 3.0,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_ratio: the min aspect ratio to keep samples.
-        :param max_ratio: the max aspect ratio to keep samples.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_ratio = min_ratio
-        self.max_ratio = max_ratio
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.aspect_ratios in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.aspect_ratios] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        # compute the aspect ratio of each image as W / H
-        aspect_ratios = {
-            key: (images[key].width / images[key].height)
-            for key in images
-        }
-        sample[Fields.stats][StatsKeys.aspect_ratios] = [
-            aspect_ratios[key] for key in loaded_image_keys
-        ]
-        return sample
-
-    def process_single(self, sample):
-        aspect_ratios = sample[Fields.stats][StatsKeys.aspect_ratios]
-        keep_bools = np.array([
-            self.min_ratio <= aspect_ratio <= self.max_ratio
-            for aspect_ratio in aspect_ratios
-        ])
-        if len(keep_bools) <= 0:
-            return True
-
-        # apply the 'any' or 'all' keep strategy
-        return keep_bools.any() if self.any else keep_bools.all()
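A worked example of the range check:

width, height = 1920, 1080
aspect_ratio = width / height         # ≈ 1.78
print(0.333 <= aspect_ratio <= 3.0)   # True -> the image passes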
diff --git a/_modules/data_juicer/ops/filter/image_face_count_filter.html b/_modules/data_juicer/ops/filter/image_face_count_filter.html
deleted file mode 100644
index 1a184dbd0..000000000
--- a/_modules/data_juicer/ops/filter/image_face_count_filter.html
+++ /dev/null
@@ -1,233 +0,0 @@

Source code for data_juicer.ops.filter.image_face_count_filter

-import os
-
-import numpy as np
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
-                                        load_image)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Filter
-from ..op_fusion import LOADED_IMAGES
-
-cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'image_face_count_filter'
-
-
-
-@UNFORKABLE.register_module(OP_NAME)
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageFaceCountFilter(Filter):
-    """Filter to keep samples with the number of faces within a specific
-    range."""
-
-    _default_kwargs = {
-        'scaleFactor': 1.1,
-        'minNeighbors': 3,
-        'minSize': None,
-        'maxSize': None,
-    }
-
-    def __init__(self,
-                 cv_classifier: str = '',
-                 min_face_count: int = 1,
-                 max_face_count: int = 1,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param cv_classifier: OpenCV classifier path for face detection.
-            By default, we will use 'haarcascade_frontalface_alt.xml'.
-        :param min_face_count: Minimum number of faces required for samples.
-        :param max_face_count: Maximum number of faces required for samples.
-        :param any_or_all: Keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: Extra positional arguments.
-        :param kwargs: Extra keyword arguments.
-        """
-        super().__init__(*args, **kwargs)
-
-        if cv_classifier == '':
-            cv_classifier = os.path.join(cv2.data.haarcascades,
-                                         'haarcascade_frontalface_alt.xml')
-
-        self.min_face_count = min_face_count
-        self.max_face_count = max_face_count
-
-        self.extra_kwargs = self._default_kwargs
-        for key in kwargs:
-            if key in self.extra_kwargs:
-                self.extra_kwargs[key] = kwargs[key]
-
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-        self.model_key = prepare_model(model_type='opencv_classifier',
-                                       model_path=cv_classifier)
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.face_counts in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.face_counts] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        model = get_model(self.model_key)
-
-        # count the number of detected faces in each image
-        face_counts = {}
-        try:
-            for key, image in images.items():
-                dets = detect_faces(image, model, **self.extra_kwargs)
-                face_counts[key] = len(dets)
-            logger.debug(f'face counts: {face_counts}')
-        except Exception as e:
-            logger.exception(e)
-
-        sample[Fields.stats][StatsKeys.face_counts] = [
-            face_counts[key] for key in loaded_image_keys
-        ]
-        return sample
-
-    def process_single(self, sample):
-        face_counts = sample[Fields.stats][StatsKeys.face_counts]
-        if len(face_counts) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_face_count <= face_count <= self.max_face_count
-            for face_count in face_counts
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
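The __init__ above folds user-supplied **kwargs into the OpenCV detector defaults. A standalone sketch of that merge (values made up); note the sketch copies the defaults first, which avoids mutating a dict that would otherwise be shared across instances:

defaults = {'scaleFactor': 1.1, 'minNeighbors': 3, 'minSize': None, 'maxSize': None}
user_kwargs = {'minNeighbors': 5, 'unrelated': 42}  # unknown keys are ignored
extra_kwargs = dict(defaults)
for key in user_kwargs:
    if key in extra_kwargs:
        extra_kwargs[key] = user_kwargs[key]
print(extra_kwargs)
# {'scaleFactor': 1.1, 'minNeighbors': 5, 'minSize': None, 'maxSize': None}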
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_face_ratio_filter.html b/_modules/data_juicer/ops/filter/image_face_ratio_filter.html
deleted file mode 100644
index 44006b127..000000000
--- a/_modules/data_juicer/ops/filter/image_face_ratio_filter.html
+++ /dev/null
@@ -1,237 +0,0 @@

Source code for data_juicer.ops.filter.image_face_ratio_filter

-import os
-
-import numpy as np
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
-                                        load_image)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Filter
-from ..op_fusion import LOADED_IMAGES
-
-cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'image_face_ratio_filter'
-
-
-
-@UNFORKABLE.register_module(OP_NAME)
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageFaceRatioFilter(Filter):
-    """Filter to keep samples with face area ratios within a specific
-    range."""
-
-    _default_kwargs = {
-        'scaleFactor': 1.1,
-        'minNeighbors': 3,
-        'minSize': None,
-        'maxSize': None,
-    }
-
-    def __init__(self,
-                 cv_classifier: str = '',
-                 min_ratio: float = 0.0,
-                 max_ratio: float = 0.4,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param cv_classifier: OpenCV classifier path for face detection.
-            By default, we will use 'haarcascade_frontalface_alt.xml'.
-        :param min_ratio: Min ratio for the largest face area in an image.
-        :param max_ratio: Max ratio for the largest face area in an image.
-        :param any_or_all: Keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: Extra positional arguments.
-        :param kwargs: Extra keyword arguments.
-        """
-        super().__init__(*args, **kwargs)
-
-        if cv_classifier == '':
-            cv_classifier = os.path.join(cv2.data.haarcascades,
-                                         'haarcascade_frontalface_alt.xml')
-        self.min_ratio = min_ratio
-        self.max_ratio = max_ratio
-
-        self.extra_kwargs = self._default_kwargs
-        for key in kwargs:
-            if key in self.extra_kwargs:
-                self.extra_kwargs[key] = kwargs[key]
-
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-        self.model_key = prepare_model(model_type='opencv_classifier',
-                                       model_path=cv_classifier)
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.face_ratios in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.face_ratios] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        model = get_model(self.model_key)
-
-        # detect faces
-        face_detections = {}
-        for key, image in images.items():
-            face_detections[key] = detect_faces(image, model,
                                                **self.extra_kwargs)
-        logger.debug(f'detections: {face_detections}')
-
-        # compute face area ratios for each image considering the largest face
-        face_area_ratios = {}
-        for key, dets in face_detections.items():
-            image_area = images[key].width * images[key].height
-            face_area_ratios[key] = max([w * h for _, _, w, h in dets],
-                                        default=0.0) / image_area
-        logger.debug(f'ratios: {face_area_ratios}')
-
-        sample[Fields.stats][StatsKeys.face_ratios] = [
-            face_area_ratios[key] for key in loaded_image_keys
-        ]
-        return sample
-
-    def process_single(self, sample):
-        face_ratios = sample[Fields.stats][StatsKeys.face_ratios]
-        if len(face_ratios) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_ratio <= face_ratio <= self.max_ratio
-            for face_ratio in face_ratios
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
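The stat recorded per image is the area of the largest detected face divided by the image area. A self-contained sketch with made-up detections:

# (x, y, w, h) face boxes from a hypothetical detector
dets = [(10, 20, 50, 80), (100, 100, 20, 30)]
image_w, image_h = 640, 480
largest_face_area = max((w * h for _, _, w, h in dets), default=0.0)
face_area_ratio = largest_face_area / (image_w * image_h)
print(round(face_area_ratio, 4))  # 0.013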
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_nsfw_filter.html b/_modules/data_juicer/ops/filter/image_nsfw_filter.html
deleted file mode 100644
index 4b248ecad..000000000
--- a/_modules/data_juicer/ops/filter/image_nsfw_filter.html
+++ /dev/null
@@ -1,206 +0,0 @@

Source code for data_juicer.ops.filter.image_nsfw_filter

-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'image_nsfw_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageNSFWFilter(Filter):
-    """Filter to keep samples whose images have low nsfw scores."""
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_nsfw_model: str = 'Falconsai/nsfw_image_detection',
-                 trust_remote_code: bool = False,
-                 score_threshold: float = 0.5,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_nsfw_model: nsfw detection model name on huggingface.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param score_threshold: the nsfw score threshold for samples,
-            ranging from 0 to 1. Samples with nsfw scores less than this
-            threshold will be kept.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.score_threshold = score_threshold
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(
-            model_type='huggingface',
-            pretrained_model_name_or_path=hf_nsfw_model,
-            trust_remote_code=trust_remote_code)
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_nsfw_score in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_nsfw_score] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        images = [images[key] for key in images]
-        inputs = processor(images=images,
-                           return_tensors='pt').to(model.device)
-        outputs = model(**inputs)
-        logits = outputs.logits
-        nsfw_scores = [
-            float(scores[1]) for scores in torch.softmax(logits, dim=-1)
-        ]
-
-        sample[Fields.stats][StatsKeys.image_nsfw_score] = nsfw_scores
-
-        return sample
-
-    def process_single(self, sample, rank=None):
-        nsfw_scores = sample[Fields.stats][StatsKeys.image_nsfw_score]
-        if len(nsfw_scores) <= 0:
-            return True
-
-        keep_bools = np.array([
-            nsfw_score < self.score_threshold for nsfw_score in nsfw_scores
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
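The per-image score is the softmax probability of the classifier's second logit column, which this filter reads as the nsfw class (the label order is the model's convention, assumed here). A toy example:

import torch

logits = torch.tensor([[2.0, -1.0], [0.2, 1.5]])  # two images, two classes
nsfw_scores = [float(s[1]) for s in torch.softmax(logits, dim=-1)]
print([round(s, 3) for s in nsfw_scores])  # [0.047, 0.786]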
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html b/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html
deleted file mode 100644
index 60dda0195..000000000
--- a/_modules/data_juicer/ops/filter/image_pair_similarity_filter.html
+++ /dev/null
@@ -1,217 +0,0 @@

Source code for data_juicer.ops.filter.image_pair_similarity_filter

-import numpy as np
-from jsonargparse.typing import ClosedUnitInterval
-
-from data_juicer.ops.base_op import OPERATORS, Filter
-from data_juicer.ops.op_fusion import LOADED_IMAGES
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'image_pair_similarity_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImagePairSimilarityFilter(Filter):
-    """Filter to keep image pairs with similarities between images
-    within a specific range."""
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_clip='openai/clip-vit-base-patch32',
-                 trust_remote_code=False,
-                 min_score: ClosedUnitInterval = 0.1,
-                 max_score: ClosedUnitInterval = 1.0,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_clip: clip model name on huggingface to compute
-            the similarity between the two images of each sample.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param min_score: The min similarity to keep samples.
-        :param max_score: The max similarity to keep samples.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_score = min_score
-        self.max_score = max_score
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(model_type='huggingface',
-                                       pretrained_model_name_or_path=hf_clip,
-                                       trust_remote_code=trust_remote_code)
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_pair_similarity in sample[Fields.stats]:
-            return sample
-
-        # this op requires exactly two distinct images per sample
-        if (self.image_key not in sample
-                or len(sample[self.image_key]) != 2
-                or sample[self.image_key][0] == sample[self.image_key][1]):
-            raise ValueError('Each sample must contain two distinct images.')
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        image_list = [images[temp_key] for temp_key in images.keys()]
-        image_tensors = processor.image_processor(
-            image_list, return_tensors='pt')['pixel_values']
-        image1_batch_feature = model.get_image_features(
-            image_tensors[0].unsqueeze(0).to(model.device))
-        image2_batch_feature = model.get_image_features(
-            image_tensors[1].unsqueeze(0).to(model.device))
-
-        similarity = torch.cosine_similarity(image1_batch_feature,
-                                             image2_batch_feature,
-                                             dim=1)
-        sample[Fields.stats][StatsKeys.image_pair_similarity] = similarity
-
-        return sample
-
-    def process_single(self, sample, rank=None):
-        similarity = sample[Fields.stats][StatsKeys.image_pair_similarity]
-        if len(similarity) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_score <= sim_value <= self.max_score
-            for sim_value in similarity
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
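The stat is a plain cosine similarity between the two images' CLIP embeddings. A sketch with random stand-in features:

import torch

feat1 = torch.randn(1, 512)  # stand-ins for model.get_image_features(...)
feat2 = torch.randn(1, 512)
sim = torch.cosine_similarity(feat1, feat2, dim=1)
print(float(sim))  # some value in [-1, 1]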
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_shape_filter.html b/_modules/data_juicer/ops/filter/image_shape_filter.html
deleted file mode 100644
index 49d26401e..000000000
--- a/_modules/data_juicer/ops/filter/image_shape_filter.html
+++ /dev/null
@@ -1,203 +0,0 @@

Source code for data_juicer.ops.filter.image_shape_filter

-import sys
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-
-
-@OPERATORS.register_module('image_shape_filter')
-@LOADED_IMAGES.register_module('image_shape_filter')
-class ImageShapeFilter(Filter):
-    """Filter to keep samples with image shape (w, h) within specific
-    ranges."""
-
-    def __init__(self,
-                 min_width: int = 1,
-                 max_width: int = sys.maxsize,
-                 min_height: int = 1,
-                 max_height: int = sys.maxsize,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_width: The min width to keep samples.
-        :param max_width: The max width to keep samples.
-        :param min_height: The min height to keep samples.
-        :param max_height: The max height to keep samples.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_width = min_width
-        self.max_width = max_width
-        self.min_height = min_height
-        self.max_height = max_height
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.image_width in sample[Fields.stats] \
-                and StatsKeys.image_height in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_width] = np.array(
-                [], dtype=np.int64)
-            sample[Fields.stats][StatsKeys.image_height] = np.array(
-                [], dtype=np.int64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        # get width and height for each image
-        whs = {key: (images[key].width, images[key].height) for key in images}
-        sample[Fields.stats][StatsKeys.image_width] = [
-            whs[key][0] for key in loaded_image_keys
-        ]
-        sample[Fields.stats][StatsKeys.image_height] = [
-            whs[key][1] for key in loaded_image_keys
-        ]
-        return sample
-
-    def process_single(self, sample):
-        ws = sample[Fields.stats][StatsKeys.image_width]
-        hs = sample[Fields.stats][StatsKeys.image_height]
-        if len(ws) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_width <= w <= self.max_width
-            and self.min_height <= h <= self.max_height
-            for w, h in zip(ws, hs)
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_size_filter.html b/_modules/data_juicer/ops/filter/image_size_filter.html
deleted file mode 100644
index c49feb27c..000000000
--- a/_modules/data_juicer/ops/filter/image_size_filter.html
+++ /dev/null
@@ -1,183 +0,0 @@

Source code for data_juicer.ops.filter.image_size_filter

-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import get_file_size, size_to_bytes
-
-from ..base_op import OPERATORS, Filter
-
-
-
-@OPERATORS.register_module('image_size_filter')
-class ImageSizeFilter(Filter):
-    """Keep data samples whose image size (in Bytes/KB/MB/...) is within a
-    specific range."""
-
-    def __init__(self,
-                 min_size: str = '0',
-                 max_size: str = '1TB',
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_size: The min image size to keep samples. Set to "0" by
-            default for no size constraint.
-        :param max_size: The max image size to keep samples. Set to "1TB"
-            by default, an approximation of the unlimited case.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_size = size_to_bytes(min_size)
-        self.max_size = size_to_bytes(max_size)
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-
-    def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.image_sizes in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_sizes] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # for size calculation, no need to load images into memory
-        sample[Fields.stats][StatsKeys.image_sizes] = [
-            get_file_size(img_path) for img_path in sample[self.image_key]
-        ]
-
-        return sample
-
-    def process_single(self, sample):
-        image_sizes = sample[Fields.stats][StatsKeys.image_sizes]
-        if len(image_sizes) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_size <= image_size <= self.max_size
-            for image_size in image_sizes
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
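min_size/max_size are human-readable strings converted to bytes. The real converter is data_juicer.utils.mm_utils.size_to_bytes; the stand-in below only illustrates the idea, and its binary units (1KB = 1024B) are an assumption:

UNITS = {'B': 1, 'KB': 1 << 10, 'MB': 1 << 20, 'GB': 1 << 30, 'TB': 1 << 40}

def size_to_bytes_sketch(size: str) -> int:
    size = size.strip().upper()
    for unit in sorted(UNITS, key=len, reverse=True):
        if size.endswith(unit):
            return int(float(size[:-len(unit)]) * UNITS[unit])
    return int(float(size))  # a bare number is taken as bytes

print(size_to_bytes_sketch('2MB'))  # 2097152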
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_text_matching_filter.html b/_modules/data_juicer/ops/filter/image_text_matching_filter.html
deleted file mode 100644
index a8f911b06..000000000
--- a/_modules/data_juicer/ops/filter/image_text_matching_filter.html
+++ /dev/null
@@ -1,260 +0,0 @@

Source code for data_juicer.ops.filter.image_text_matching_filter

-import numpy as np
-from PIL import ImageOps
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
-                                        load_image, remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-OP_NAME = 'image_text_matching_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageTextMatchingFilter(Filter):
-    """Filter to keep samples whose image-text matching scores are within
-    a specific range."""
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_blip: str = 'Salesforce/blip-itm-base-coco',
-                 trust_remote_code: bool = False,
-                 min_score: float = 0.003,
-                 max_score: float = 1.0,
-                 horizontal_flip: bool = False,
-                 vertical_flip: bool = False,
-                 any_or_all: str = 'any',
-                 reduce_mode: str = 'avg',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_blip: blip model name on huggingface to compute
-            the matching score between image and text.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param min_score: The min matching score to keep samples.
-        :param max_score: The max matching score to keep samples.
-        :param horizontal_flip: Flip image horizontally (left to right).
-        :param vertical_flip: Flip image vertically (top to bottom).
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param reduce_mode: reduce mode when one text corresponds to
-            multiple images in a chunk.
-            'avg': Take the average of multiple values
-            'max': Take the max of multiple values
-            'min': Take the min of multiple values
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_score = min_score
-        self.max_score = max_score
-        if reduce_mode not in ['avg', 'max', 'min']:
-            raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
-                             f'Can only be one of ["avg", "max", "min"].')
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(model_type='huggingface',
-                                       pretrained_model_name_or_path=hf_blip,
-                                       trust_remote_code=trust_remote_code)
-        self.reduce_mode = reduce_mode
-        self.horizontal_flip = horizontal_flip
-        self.vertical_flip = vertical_flip
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_text_matching_score in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][
-                StatsKeys.image_text_matching_score] = np.array(
-                    [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        text = sample[self.text_key]
-        offset = 0
-        matching_scores = []
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        for chunk in text.split(SpecialTokens.eoc):
-            count = chunk.count(SpecialTokens.image)
-
-            # skip chunks with no image or no text
-            if count == 0 or len(chunk) == 0:
-                continue
-
-            text_chunk = remove_special_tokens(chunk)
-            image_chunk = []
-            for image_key in loaded_image_keys[offset:offset + count]:
-                image = images[image_key]
-                if self.horizontal_flip:
-                    image = ImageOps.mirror(image)
-                if self.vertical_flip:
-                    image = ImageOps.flip(image)
-                image_chunk.append(image)
-
-            inputs = processor(text=text_chunk,
-                               images=image_chunk,
-                               return_tensors='pt',
-                               truncation=True,
-                               max_length=model.config.text_config.
-                               max_position_embeddings,
-                               padding=True).to(model.device)
-
-            outputs = model(**inputs)
-            itm_scores = outputs.itm_score.detach().cpu().softmax(
-                dim=-1)[:, 1]
-
-            if self.reduce_mode == 'avg':
-                chunk_itm_score = itm_scores.mean()
-            elif self.reduce_mode == 'max':
-                chunk_itm_score = itm_scores.max()
-            else:
-                chunk_itm_score = itm_scores.min()
-
-            matching_scores.append(float(chunk_itm_score))
-            offset += count
-        sample[Fields.stats][
-            StatsKeys.image_text_matching_score] = matching_scores
-
-        return sample
-
-    def process_single(self, sample, rank=None):
-        itm_scores = sample[Fields.stats][StatsKeys.image_text_matching_score]
-        if len(itm_scores) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_score <= itm_score <= self.max_score
-            for itm_score in itm_scores
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
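Texts are split into chunks at the end-of-chunk token, and each chunk is scored against the images it references. A sketch of the bookkeeping with placeholder tokens (data_juicer's actual SpecialTokens values differ):

eoc, image = '<eoc>', '<image>'
text = f'{image} a cat on a mat {eoc}{image}{image} two dogs {eoc}'
offset = 0
for chunk in text.split(eoc):
    count = chunk.count(image)
    if count == 0 or len(chunk) == 0:
        continue
    # images[offset:offset + count] would be scored against this text
    print(count, repr(chunk.replace(image, '').strip()))
    offset += count
# 1 'a cat on a mat'
# 2 'two dogs'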
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_text_similarity_filter.html b/_modules/data_juicer/ops/filter/image_text_similarity_filter.html
deleted file mode 100644
index 20593a03a..000000000
--- a/_modules/data_juicer/ops/filter/image_text_similarity_filter.html
+++ /dev/null
@@ -1,257 +0,0 @@

Source code for data_juicer.ops.filter.image_text_similarity_filter

-import numpy as np
-from PIL import ImageOps
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
-                                        load_image, remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-OP_NAME = 'image_text_similarity_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageTextSimilarityFilter(Filter):
-    """Filter to keep samples whose image-text similarities are within a
-    specific range."""
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_clip: str = 'openai/clip-vit-base-patch32',
-                 trust_remote_code: bool = False,
-                 min_score: float = 0.1,
-                 max_score: float = 1.0,
-                 horizontal_flip: bool = False,
-                 vertical_flip: bool = False,
-                 any_or_all: str = 'any',
-                 reduce_mode: str = 'avg',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_clip: clip model name on huggingface to compute
-            the similarity between image and text.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param min_score: The min similarity to keep samples.
-        :param max_score: The max similarity to keep samples.
-        :param horizontal_flip: Flip image horizontally (left to right).
-        :param vertical_flip: Flip image vertically (top to bottom).
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param reduce_mode: reduce mode when one text corresponds to
-            multiple images in a chunk.
-            'avg': Take the average of multiple values
-            'max': Take the max of multiple values
-            'min': Take the min of multiple values
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_score = min_score
-        self.max_score = max_score
-        if reduce_mode not in ['avg', 'max', 'min']:
-            raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
-                             f'Can only be one of ["avg", "max", "min"].')
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(model_type='huggingface',
-                                       pretrained_model_name_or_path=hf_clip,
-                                       trust_remote_code=trust_remote_code)
-        self.reduce_mode = reduce_mode
-        self.horizontal_flip = horizontal_flip
-        self.vertical_flip = vertical_flip
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_text_similarity in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_text_similarity] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        text = sample[self.text_key]
-        offset = 0
-        similarity = []
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        for chunk in text.split(SpecialTokens.eoc):
-            count = chunk.count(SpecialTokens.image)
-
-            # skip chunks with no image or no text
-            if count == 0 or len(chunk) == 0:
-                continue
-
-            text_chunk = remove_special_tokens(chunk)
-            image_chunk = []
-            for image_key in loaded_image_keys[offset:offset + count]:
-                image = images[image_key]
-                if self.horizontal_flip:
-                    image = ImageOps.mirror(image)
-                if self.vertical_flip:
-                    image = ImageOps.flip(image)
-                image_chunk.append(image)
-
-            inputs = processor(text=text_chunk,
-                               images=image_chunk,
-                               return_tensors='pt',
-                               truncation=True,
-                               max_length=model.config.text_config.
-                               max_position_embeddings,
-                               padding=True).to(model.device)
-
-            outputs = model(**inputs)
-            chunk_logits = outputs.logits_per_text / 100.0
-
-            if self.reduce_mode == 'avg':
-                chunk_similarity = chunk_logits.mean()
-            elif self.reduce_mode == 'max':
-                chunk_similarity = chunk_logits.max()
-            else:
-                chunk_similarity = chunk_logits.min()
-
-            similarity.append(float(chunk_similarity))
-            offset += count
-        sample[Fields.stats][StatsKeys.image_text_similarity] = similarity
-
-        return sample
-
-    def process_single(self, sample, rank=None):
-        similarity = sample[Fields.stats][StatsKeys.image_text_similarity]
-        if len(similarity) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_score <= sim_value <= self.max_score
-            for sim_value in similarity
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/image_watermark_filter.html b/_modules/data_juicer/ops/filter/image_watermark_filter.html
deleted file mode 100644
index fbe8d38ab..000000000
--- a/_modules/data_juicer/ops/filter/image_watermark_filter.html
+++ /dev/null
@@ -1,210 +0,0 @@

Source code for data_juicer.ops.filter.image_watermark_filter

-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'image_watermark_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class ImageWatermarkFilter(Filter):
-    """
-    Filter to keep samples whose images have no watermark with high
-    probability.
-    """
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_watermark_model: str = 'amrul-hzz/watermark_detector',
-                 trust_remote_code: bool = False,
-                 prob_threshold: float = 0.8,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_watermark_model: watermark detection model name on
-            huggingface.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param prob_threshold: the predicted watermark probability threshold
-            for samples, ranging from 0 to 1. Samples with watermark
-            probabilities less than this threshold will be kept.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.prob_threshold = prob_threshold
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(
-            model_type='huggingface',
-            pretrained_model_name_or_path=hf_watermark_model,
-            trust_remote_code=trust_remote_code)
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.image_watermark_prob in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.image_watermark_prob] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        images = [images[key] for key in images]
-        inputs = processor(images=images,
-                           return_tensors='pt').to(model.device)
-        outputs = model(**inputs)
-        logits = outputs.logits
-        watermark_probs = [
-            float(probs[1]) for probs in torch.softmax(logits, dim=-1)
-        ]
-
-        sample[Fields.stats][StatsKeys.image_watermark_prob] = watermark_probs
-
-        return sample
-
-    def process_single(self, sample, rank=None):
-        watermark_probs = sample[Fields.stats][StatsKeys.image_watermark_prob]
-        if len(watermark_probs) <= 0:
-            return True
-
-        keep_bools = np.array([
-            prob < self.prob_threshold for prob in watermark_probs
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/language_id_score_filter.html b/_modules/data_juicer/ops/filter/language_id_score_filter.html
deleted file mode 100644
index 2d92774d4..000000000
--- a/_modules/data_juicer/ops/filter/language_id_score_filter.html
+++ /dev/null
@@ -1,184 +0,0 @@

Source code for data_juicer.ops.filter.language_id_score_filter

-from typing import List, Union
-
-from loguru import logger
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-
-fasttext = LazyLoader('fasttext', 'fasttext')
-
-OP_NAME = 'language_id_score_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-class LanguageIDScoreFilter(Filter):
-    """Filter to keep samples in a specific language with confidence score
-    larger than a specific min value."""
-
-    def __init__(self,
-                 lang: Union[str, List[str]] = '',
-                 min_score: float = 0.8,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param lang: Samples in which languages to keep.
-        :param min_score: The min language identification confidence
-            scores of samples to keep.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        if not lang:
-            # lang is [], '' or None
-            self.lang = None
-        elif isinstance(lang, str):
-            # lang is a single language string
-            self.lang = [lang]
-        else:
-            # lang is a list of multiple languages
-            self.lang = lang
-        self.min_score = min_score
-        self.model_key = prepare_model(model_type='fasttext')
-
-    def compute_stats_single(self, sample):
-        # check if it's computed already
-        if StatsKeys.lang in sample[Fields.stats] \
-                and StatsKeys.lang_score in sample[Fields.stats]:
-            return sample
-
-        text = sample[self.text_key].lower().replace('\n', ' ')
-        ft_model = get_model(self.model_key)
-        if ft_model is None:
-            err_msg = 'Model not loaded. Please retry later.'
-            logger.error(err_msg)
-            raise ValueError(err_msg)
-        pred = ft_model.predict(text)
-        lang_id = pred[0][0].replace('__label__', '')
-        lang_score = pred[1][0]
-
-        sample[Fields.stats][StatsKeys.lang] = lang_id
-        sample[Fields.stats][StatsKeys.lang_score] = lang_score
-
-        return sample
-
-    def process_single(self, sample):
-        if self.lang:
-            return sample[Fields.stats][StatsKeys.lang] in self.lang \
-                and sample[Fields.stats][StatsKeys.lang_score] >= \
-                self.min_score
-        else:
-            return sample[Fields.stats][StatsKeys.lang_score] >= \
-                self.min_score
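The fasttext prediction is a pair of (labels, scores); the filter strips fasttext's '__label__' prefix to get the language code. Illustrated with a made-up prediction (a real one comes from get_model(self.model_key).predict(text)):

pred = (('__label__en',), [0.98])
lang_id = pred[0][0].replace('__label__', '')
lang_score = pred[1][0]
print(lang_id, lang_score)  # en 0.98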
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html b/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
deleted file mode 100644
index d67332088..000000000
--- a/_modules/data_juicer/ops/filter/maximum_line_length_filter.html
+++ /dev/null
@@ -1,182 +0,0 @@

Source code for data_juicer.ops.filter.maximum_line_length_filter

-import sys
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_LINES
-
-OP_NAME = 'maximum_line_length_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@INTER_LINES.register_module(OP_NAME)
-class MaximumLineLengthFilter(Filter):
-    """Filter to keep samples with maximum line length within a specific
-    range."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 min_len: int = 10,
-                 max_len: int = sys.maxsize,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_len: The min filter length in this op, samples will
-            be filtered if their maximum line length is below this
-            parameter.
-        :param max_len: The max filter length in this op, samples will
-            be filtered if their maximum line length exceeds this
-            parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_len = min_len
-        self.max_len = max_len
-
-    def compute_stats_batched(self, samples, context=False):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-        context_key = f'{InterVars.lines}'
-
-        for idx, stat in enumerate(samples_stats):
-            # check if it's computed already
-            if StatsKeys.max_line_length in stat:
-                continue
-
-            if context and context_key in samples[Fields.context][idx]:
-                lines = samples[Fields.context][idx][context_key]
-            else:
-                lines = samples_list[idx].splitlines()
-                if context:
-                    samples[Fields.context][idx][context_key] = lines
-            line_lengths = list(map(len, lines))
-            samples_stats[idx][StatsKeys.max_line_length] = max(
-                line_lengths) if line_lengths else 0
-
-        return samples
-
-    def process_batched(self, samples):
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: self.min_len <= stat[StatsKeys.max_line_length] <=
-                self.max_len, samples[Fields.stats])
-        else:
-            # single sample for ray filter
-            return self.min_len <= samples[Fields.stats][
-                StatsKeys.max_line_length] <= self.max_len
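The statistic itself is just the longest line of the text, with 0 for empty text:

text = 'short\na much, much longer line\nmid'
line_lengths = list(map(len, text.splitlines()))
max_line_length = max(line_lengths) if line_lengths else 0
print(max_line_length)  # 24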
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/perplexity_filter.html b/_modules/data_juicer/ops/filter/perplexity_filter.html
deleted file mode 100644
index 465e95847..000000000
--- a/_modules/data_juicer/ops/filter/perplexity_filter.html
+++ /dev/null
@@ -1,190 +0,0 @@

Source code for data_juicer.ops.filter.perplexity_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import get_words_from_document
-from ..op_fusion import INTER_WORDS
-
-OP_NAME = 'perplexity_filter'
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@INTER_WORDS.register_module(OP_NAME)
-class PerplexityFilter(Filter):
-    """Filter to keep samples with perplexity score less than a specific max
-    value."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 lang: str = 'en',
-                 max_ppl: float = 1500,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param lang: The language of samples to compute perplexity for.
-        :param max_ppl: The max filter perplexity in this op, samples
-            will be filtered if their perplexity exceeds this parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.max_ppl = max_ppl
-        self.lang = lang
-        self.sp_model_key = prepare_model(model_type='sentencepiece',
-                                          lang=lang)
-        self.kl_model_key = prepare_model(model_type='kenlm', lang=lang)
-
-    def compute_stats_batched(self, samples, context=False):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-        words_key = f'{InterVars.words}-{self.sp_model_key}'
-
-        for idx, stat in enumerate(samples_stats):
-            # check if it's computed already
-            if StatsKeys.perplexity in stat:
-                continue
-            # tokenization
-            if context and words_key in samples[Fields.context][idx]:
-                words = samples[Fields.context][idx][words_key]
-            else:
-                tokenizer = get_model(self.sp_model_key)
-                words = get_words_from_document(
-                    samples_list[idx],
-                    token_func=tokenizer.encode_as_pieces
-                    if tokenizer else None)
-                if context:
-                    samples[Fields.context][idx][words_key] = words
-            text = ' '.join(words)
-            # compute perplexity
-            logits, length = 0, 0
-            kenlm_model = get_model(self.kl_model_key)
-            for line in text.splitlines():
-                logits += kenlm_model.score(line)
-                length += (len(line.split()) + 1)
-            ppl = (10.0**(-logits / length)) if length != 0 else 0.0
-            samples_stats[idx][StatsKeys.perplexity] = round(ppl, 1)
-
-        return samples
-
-    def process_batched(self, samples):
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: stat[StatsKeys.perplexity] <= self.max_ppl,
-                samples[Fields.stats])
-        else:
-            return samples[Fields.stats][StatsKeys.perplexity] <= self.max_ppl
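A worked instance of the perplexity formula above, with made-up KenLM line scores (kenlm's score() returns log10 probabilities, hence base 10):

line_scores = [-12.4, -8.1]  # one log10 score per line
lengths = [5 + 1, 3 + 1]     # len(line.split()) + 1 per line
logits = sum(line_scores)    # -20.5
length = sum(lengths)        # 10
ppl = 10.0 ** (-logits / length) if length != 0 else 0.0
print(round(ppl, 1))  # 112.2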
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html b/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html
deleted file mode 100644
index 53852bd2c..000000000
--- a/_modules/data_juicer/ops/filter/phrase_grounding_recall_filter.html
+++ /dev/null
@@ -1,379 +0,0 @@

Source code for data_juicer.ops.filter.phrase_grounding_recall_filter

-from typing import List
-
-import numpy as np
-from loguru import logger
-from PIL import ImageOps
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (SpecialTokens, iou,
-                                        load_data_with_context, load_image,
-                                        remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_IMAGES
-
-torch = LazyLoader('torch', 'torch')
-nltk = LazyLoader('nltk', 'nltk')
-
-OP_NAME = 'phrase_grounding_recall_filter'
-
-
-# NER algorithm adapted from GLIP starts
-# https://github.com/microsoft/GLIP/blob/main/maskrcnn_benchmark/engine/predictor_glip.py#L107-L127
-def find_noun_phrases(caption: str) -> List[str]:
-    caption = caption.lower()
-    tokens = nltk.word_tokenize(caption)
-    pos_tags = nltk.pos_tag(tokens)
-
-    grammar = 'NP: {<DT>?<JJ.*>*<NN.*>+}'
-    cp = nltk.RegexpParser(grammar)
-    result = cp.parse(pos_tags)
-
-    noun_phrases = list()
-    for subtree in result.subtrees():
-        if subtree.label() == 'NP':
-            noun_phrases.append(' '.join(t[0] for t in subtree.leaves()))
-
-    return noun_phrases
-
-
-def remove_punctuation(text: str) -> str:
-    punct = [
-        '|', ':', ';', '@', '(', ')', '[', ']', '{', '}', '^', '\'', '\"', '’',
-        '`', '?', '$', '%', '#', '!', '&', '*', '+', ',', '.'
-    ]
-    for p in punct:
-        text = text.replace(p, '')
-    return text.strip()
-
-
-def run_ner(caption):
-    noun_phrases = find_noun_phrases(caption)
-    noun_phrases = [remove_punctuation(phrase) for phrase in noun_phrases]
-    noun_phrases = [phrase for phrase in noun_phrases if phrase != '']
-    noun_phrases = list(set(noun_phrases))  # remove duplicate ners
-    return noun_phrases
-
-
-# NER algorithm adapted from GLIP ends
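Assuming the required nltk data is available, the helpers above turn a caption into deduplicated noun phrases; roughly (the exact output depends on the nltk tagger):

phrases = run_ner('A small cat sits on the red mat.')
print(sorted(phrases))  # e.g. ['a small cat', 'the red mat']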
-
-
-
-@OPERATORS.register_module(OP_NAME)
-@LOADED_IMAGES.register_module(OP_NAME)
-class PhraseGroundingRecallFilter(Filter):
-    """Filter to keep samples whose locating recalls of phrases extracted
-    from text in the images are within a specified range."""
-
-    _accelerator = 'cuda'
-
-    def __init__(self,
-                 hf_owlvit: str = 'google/owlvit-base-patch32',
-                 trust_remote_code: bool = False,
-                 min_recall: float = 0.1,
-                 max_recall: float = 1.0,
-                 horizontal_flip: bool = False,
-                 vertical_flip: bool = False,
-                 any_or_all: str = 'any',
-                 reduce_mode: str = 'avg',
-                 iou_thr: float = 0.5,
-                 large_area_ratio_thr: float = 0.95,
-                 conf_thr: float = 0.0,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_owlvit: Owl-ViT model name on huggingface to locate the
-            phrases extracted from the text.
-        :param trust_remote_code: whether to trust code shipped with the
-            huggingface model.
-        :param min_recall: The min phrase grounding recall to keep samples.
-        :param max_recall: The max phrase grounding recall to keep samples.
-        :param horizontal_flip: Flip image horizontally (left to right).
-        :param vertical_flip: Flip image vertically (top to bottom).
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all images. 'any': keep this sample if any images meet the
-            condition. 'all': keep this sample only if all images meet the
-            condition.
-        :param reduce_mode: reduce mode when one text corresponds to
-            multiple images in a chunk.
-            'avg': Take the average of multiple values
-            'max': Take the max of multiple values
-            'min': Take the min of multiple values
-        :param iou_thr: the IoU threshold for the NMS-like post-process. If
-            two predicted bboxes overlap with an IoU larger than this
-            threshold, the bbox with lower confidence will be removed.
-            Default: 0.5.
-        :param large_area_ratio_thr: the area ratio threshold for filtering
-            out large predicted bboxes. If the area of a predicted bbox
-            accounts for more than this ratio of the whole image area,
-            this bbox will be removed. Default: 0.95.
-        :param conf_thr: the confidence score threshold for removing
-            low-confidence bboxes. If the confidence score of a predicted
-            bbox is lower than the threshold, this bbox will be removed.
-            Default: 0.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_recall = min_recall
-        self.max_recall = max_recall
-        if reduce_mode not in ['avg', 'max', 'min']:
-            raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
-                             f'Can only be one of ["avg", "max", "min"].')
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(
-            model_type='huggingface',
-            pretrained_model_name_or_path=hf_owlvit,
-            trust_remote_code=trust_remote_code)
-        self.reduce_mode = reduce_mode
-        self.horizontal_flip = horizontal_flip
-        self.vertical_flip = vertical_flip
-
-        self.iou_thr = iou_thr
-        self.large_area_ratio_thr = large_area_ratio_thr
-        self.conf_thr = conf_thr
-
-        requires_nltk_data = ['punkt', 'averaged_perceptron_tagger']
-        logger.info(f'Downloading nltk data of {requires_nltk_data}...')
-        for nltk_data_pkg in requires_nltk_data:
-            nltk.download(nltk_data_pkg)
-
-    def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.phrase_grounding_recall in sample[Fields.stats]:
-            return sample
-
-        # there is no image in this sample
-        if self.image_key not in sample or not sample[self.image_key]:
-            sample[Fields.stats][StatsKeys.phrase_grounding_recall] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load images
-        loaded_image_keys = sample[self.image_key]
-        sample, images = load_data_with_context(sample, context,
-                                                loaded_image_keys, load_image)
-
-        text = sample[self.text_key]
-        offset = 0
-        recalls = []
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        for chunk in text.split(SpecialTokens.eoc):
-            count = chunk.count(SpecialTokens.image)
-
-            # skip chunks with no image or no text
-            if count == 0 or len(chunk) == 0:
-                continue
-
-            text_this_chunk = remove_special_tokens(chunk)
-            ners_this_chunk = run_ner(text_this_chunk)
-            num_ners = len(ners_this_chunk)
-            if num_ners <= 0:
-                # no ners found, just skip this chunk
-                recalls.append(1.0)
-                continue
-            images_this_chunk = []
-            for image_key in loaded_image_keys[offset:offset + count]:
-                image = images[image_key]
-                if self.horizontal_flip:
-                    image = ImageOps.mirror(image)
-                if self.vertical_flip:
-                    image = ImageOps.flip(image)
-                images_this_chunk.append(image)
-
-            ners_batch = [ners_this_chunk] * len(images_this_chunk)
-            inputs = processor(text=ners_batch,
-                               images=images_this_chunk,
-                               return_tensors='pt',
-                               padding=True,
-                               truncation=True).to(model.device)
-
-            with torch.no_grad():
-                outputs = model(**inputs)
-                target_sizes = torch.tensor([
-                    img.size[::-1] for img in images_this_chunk
-                ]).to(model.device)
-                results = processor.post_process_object_detection(
-                    outputs,
-                    threshold=self.conf_thr,
-                    target_sizes=target_sizes)
-
-            image_recalls = []
-            for idx, result in enumerate(results):
-                scores = result['scores']
-                labels = result['labels']
-                boxes = result['boxes']
-
-                # sort by the confidence scores
-                # and only keep the first num_ners predictions
-                order_idx = scores.argsort(descending=True)
-                scores = scores[order_idx].tolist()[:num_ners]
-                labels = labels[order_idx].tolist()[:num_ners]
-                boxes = boxes[order_idx].tolist()[:num_ners]
-
-                image_area = target_sizes[idx].prod()
-                hit = {}
-                for box, label, score in zip(boxes, labels, scores):
-                    # this ner is already hit
-                    if ners_this_chunk[label] in hit:
-                        continue
-                    # skip boxes that nearly cover the whole image
-                    xmin, ymin, xmax, ymax = box
-                    box_area = (xmax - xmin) * (ymax - ymin)
-                    if 1.0 * box_area / image_area > \
-                            self.large_area_ratio_thr:
-                        continue
-                    # skip overlapped boxes with an nms-like method
-                    suppressed = False
-                    for ner in hit:
-                        if iou(box, hit[ner][0]) > self.iou_thr:
-                            suppressed = True
-                            break
-                    if suppressed:
-                        continue
-
-                    # record the new hit box
-                    hit[ners_this_chunk[label]] = (box, score)
-
-                recall = 1.0 * len(hit) / num_ners
-                image_recalls.append(recall)
-
-            if self.reduce_mode == 'avg':
-                image_recall = sum(image_recalls) / len(image_recalls)
-            elif self.reduce_mode == 'max':
-                image_recall = max(image_recalls)
-            else:
-                image_recall = min(image_recalls)
-
-            recalls.append(image_recall)
-            offset += count
-        sample[Fields.stats][StatsKeys.phrase_grounding_recall] = recalls
-
-        return sample
-
-    def process_single(self, sample):
-        recalls = sample[Fields.stats][StatsKeys.phrase_grounding_recall]
-        if len(recalls) <= 0:
-            return True
-
-        keep_bools = np.array([
-            self.min_recall <= recall <= self.max_recall for recall in recalls
-        ])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
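The recall for one image reduces to |grounded phrases| / |extracted phrases| after the confidence, large-area, and NMS-like pruning. A sketch with made-up values:

ners_this_chunk = ['a cat', 'a mat', 'a window']
hit = {'a cat': ([10, 10, 50, 50], 0.9)}  # phrase -> (box, score) that survived pruning
recall = len(hit) / len(ners_this_chunk)
print(round(recall, 2))  # 0.33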
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/special_characters_filter.html b/_modules/data_juicer/ops/filter/special_characters_filter.html
deleted file mode 100644
index 79cc1f746..000000000
--- a/_modules/data_juicer/ops/filter/special_characters_filter.html
+++ /dev/null
@@ -1,177 +0,0 @@

Source code for data_juicer.ops.filter.special_characters_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-from ..common import SPECIAL_CHARACTERS
-
-
-
-@OPERATORS.register_module('special_characters_filter')
-class SpecialCharactersFilter(Filter):
-    """Filter to keep samples with special-char ratio within a specific
-    range."""
-
-    _batched_op = True
-
-    def __init__(self,
-                 min_ratio: float = 0.0,
-                 max_ratio: float = 0.25,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_ratio: The min filter ratio in this op, samples will
-            be filtered if their special-char ratio is below this
-            parameter.
-        :param max_ratio: The max filter ratio in this op, samples will
-            be filtered if their special-char ratio exceeds this
-            parameter.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_ratio = min_ratio
-        self.max_ratio = max_ratio
-
-    def compute_stats_batched(self, samples):
-        samples_list = samples[self.text_key]
-        samples_stats = samples[Fields.stats]
-
-        for idx, stat in enumerate(samples_stats):
-            # check if it's computed already
-            if StatsKeys.special_char_ratio in stat:
-                continue
-            cur_text = samples_list[idx]
-            # get the ratio of special characters
-            samples_stats[idx][StatsKeys.special_char_ratio] = (
-                len([c for c in cur_text if c in SPECIAL_CHARACTERS]) /
-                len(cur_text)) if len(cur_text) != 0 else 0.0
-
-        return samples
-
-    def process_batched(self, samples):
-        if isinstance(samples[Fields.stats], list):
-            return map(
-                lambda stat: self.min_ratio <= stat[
-                    StatsKeys.special_char_ratio] <= self.max_ratio,
-                samples[Fields.stats])
-        else:
-            # single sample for ray filter
-            return self.min_ratio <= \
-                samples[Fields.stats][StatsKeys.special_char_ratio] \
-                <= self.max_ratio
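The ratio is a simple character count over the text length. With a tiny stand-in charset (the real SPECIAL_CHARACTERS set comes from ..common):

special = set('#@$%')
text = 'hello #world @2024'
ratio = (sum(c in special for c in text) / len(text)) if text else 0.0
print(round(ratio, 3))  # 0.111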
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/filter/specified_field_filter.html b/_modules/data_juicer/ops/filter/specified_field_filter.html
deleted file mode 100644
index 9bd839aca..000000000
--- a/_modules/data_juicer/ops/filter/specified_field_filter.html
+++ /dev/null
@@ -1,164 +0,0 @@

Source code for data_juicer.ops.filter.specified_field_filter

-from typing import List
-
-from ..base_op import OPERATORS, Filter
-
-
-
-@OPERATORS.register_module('specified_field_filter')
-class SpecifiedFieldFilter(Filter):
-    """
-    Filter based on specified field information.
-
-    If the specified field information in the sample is not within the
-    specified target value, the sample will be filtered.
-    """
-
-    def __init__(self,
-                 field_key: str = '',
-                 target_value: List = [],
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param field_key: Filter based on the specified value
-            corresponding to the target key. A target key pointing to
-            multi-level field information needs to have its levels
-            separated by '.'.
-        :param target_value: The range of specified field information
-            corresponding to the samples that need to be retained.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.field_key = field_key
-        self.target_value = target_value
-
-    def compute_stats_single(self, sample):
-        return sample
-
-    def process_single(self, sample):
-        if not (self.field_key and self.target_value):
-            return True
-
-        field_value = sample
-        for key in self.field_key.split('.'):
-            assert key in field_value.keys(), "'{}' not in {}".format(
-                key, field_value.keys())
-            field_value = field_value[key]
-
-        if not isinstance(field_value, (list, tuple)):
-            field_value = [field_value]
-        for value in field_value:
-            if value not in self.target_value:
-                return False
-        return True
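A dotted field_key walks level by level into the sample before the membership test. Standalone sketch (sample and target values made up):

sample = {'meta': {'source': 'web'}}
target_value = ['web', 'books']
field_value = sample
for key in 'meta.source'.split('.'):
    field_value = field_value[key]
print(field_value in target_value)  # True -> the sample is kept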
diff --git a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html b/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
deleted file mode 100644
index 41dd4d16a..000000000
--- a/_modules/data_juicer/ops/filter/specified_numeric_field_filter.html
+++ /dev/null
@@ -1,178 +0,0 @@

Source code for data_juicer.ops.filter.specified_numeric_field_filter
-import sys
-
-from ..base_op import OPERATORS, Filter
-
-
-def is_number(s):
-    if s:
-        try:
-            float(s)
-            return True
-        except ValueError:
-            pass
-    return False
-
-
-
[docs]@OPERATORS.register_module('specified_numeric_field_filter') -class SpecifiedNumericFieldFilter(Filter): - """ - Filter based on specified numeric field information. - - If the specified numeric information in the sample is not within the - specified range, the sample will be filtered. - """ - -
[docs] def __init__(self, - field_key: str = '', - min_value: float = -sys.maxsize, - max_value: float = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param field_key: Filter based on the specified numeric value - corresponding to the target key. Keys of multi-level field - information should be separated by '.'. - :param min_value: The min filter value in SpecifiedNumericField - op, samples will be filtered if their specified numeric - field value is below this parameter. - :param max_value: The max filter value in SpecifiedNumericField - op, samples will be filtered if their specified numeric - field value exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.field_key = field_key - self.min_value = min_value - self.max_value = max_value
- -
[docs] def compute_stats_single(self, sample): - return sample
- -
[docs] def process_single(self, sample): - if not self.field_key: - return True - - field_value = sample - for key in self.field_key.split('.'): - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - - if is_number(field_value): - field_value = float(field_value) - return self.min_value <= field_value <= self.max_value - else: - return False
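To illustrate how is_number() gates the range comparison, a quick check with made-up field values (non-numeric strings are dropped rather than raising):

def keep(field_value, min_value=-sys.maxsize, max_value=sys.maxsize):
    if is_number(field_value):
        return min_value <= float(field_value) <= max_value
    return False

print(keep('3.14'))  # True  -- parseable and inside the default range
print(keep('n/a'))   # False -- not parseable, so the sample is filtered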
diff --git a/_modules/data_juicer/ops/filter/stopwords_filter.html b/_modules/data_juicer/ops/filter/stopwords_filter.html
deleted file mode 100644
index 96382891a..000000000
--- a/_modules/data_juicer/ops/filter/stopwords_filter.html
+++ /dev/null
@@ -1,231 +0,0 @@

Source code for data_juicer.ops.filter.stopwords_filter
-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from typing import List
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.asset_utils import ASSET_DIR, load_words_asset
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-from ..op_fusion import INTER_WORDS
-
-OP_NAME = 'stopwords_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@INTER_WORDS.register_module(OP_NAME) -class StopWordsFilter(Filter): - """Filter to keep samples with stopword ratio no less than a specific - min value.""" -
[docs] def __init__(self, - lang: str = 'en', - tokenization: bool = False, - min_ratio: float = 0.3, - stopwords_dir: str = ASSET_DIR, - use_words_aug: bool = False, - words_aug_group_sizes: List[PositiveInt] = [2], - words_aug_join_char: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param lang: Consider stopwords in what language. If lang == - "all", we will adopt the one merged from all the available - languages - :param tokenization: whether to use model to tokenize documents - :param min_ratio: The min filter ratio in this op. - :param stopwords_dir: The directory storing the stopwords - file(s) whose name includes "stopwords" and in json format - :param use_words_aug: Whether to augment words, especially for - Chinese and Vietnamese - :param words_aug_group_sizes: The group size of words to augment - :param words_aug_join_char: The join char between words to - augment - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.lang = lang - self.min_ratio = min_ratio - self.use_words_aug = use_words_aug - self.words_aug_group_sizes = words_aug_group_sizes - self.words_aug_join_char = words_aug_join_char - self.model_key = None - - self.STOPWORDS = load_words_asset(words_dir=stopwords_dir, - words_type='stopwords') - if 'all' not in self.STOPWORDS: - self.STOPWORDS['all'] = [ - val for vals in self.STOPWORDS.values() for val in vals - ] - if tokenization: - self.model_key = prepare_model(model_type='sentencepiece', - lang=lang)
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.stopwords_ratio in sample[Fields.stats]: - return sample - - # try to get words from context - words_key = f'{InterVars.words}-{self.model_key}' - if context and words_key in sample[Fields.context]: - words = sample[Fields.context][words_key] - else: - tokenizer = get_model(self.model_key) - words = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.encode_as_pieces if tokenizer else None) - if context: - sample[Fields.context][words_key] = words - - # try to get refined words from context - refined_words_key = f'{InterVars.refined_words}-True-SPECIAL_CHARS-' \ - f'{self.use_words_aug}-' \ - f'{self.words_aug_group_sizes}-' \ - f'{self.words_aug_join_char}' - if context and refined_words_key in sample[Fields.context]: - words = sample[Fields.context][refined_words_key] - else: - words = words_refinement( - words, - lower_case=True, - strip_chars=SPECIAL_CHARACTERS, - use_words_aug=self.use_words_aug, - words_aug_group_sizes=self.words_aug_group_sizes, - words_aug_join_char=self.words_aug_join_char) - if context: - sample[Fields.context][refined_words_key] = words - - stopwords_ratio = ( - len([word for word in words - if word in self.STOPWORDS[self.lang]]) - / len(words)) \ - if len(words) != 0 else 0.0 - - if stopwords_ratio > 1.0: - stopwords_ratio = 1.0 - - sample[Fields.stats][StatsKeys.stopwords_ratio] = stopwords_ratio - return sample
- -
[docs] def process_single(self, sample): - return sample[Fields.stats][ - StatsKeys.stopwords_ratio] >= self.min_ratio
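A minimal sketch of the stopword-ratio statistic, with a toy stopword set standing in for the JSON assets loaded via load_words_asset:

STOPWORDS = {'the', 'a', 'of', 'is', 'on'}

def stopwords_ratio(words):
    # empty word lists get ratio 0.0, as in compute_stats_single
    if len(words) == 0:
        return 0.0
    return sum(w in STOPWORDS for w in words) / len(words)

words = 'the cat is on a mat'.split()
print(stopwords_ratio(words))  # ~0.67 -> kept under the default min_ratio=0.3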
diff --git a/_modules/data_juicer/ops/filter/suffix_filter.html b/_modules/data_juicer/ops/filter/suffix_filter.html
deleted file mode 100644
index ef6690060..000000000
--- a/_modules/data_juicer/ops/filter/suffix_filter.html
+++ /dev/null
@@ -1,148 +0,0 @@

Source code for data_juicer.ops.filter.suffix_filter
-from typing import List, Union
-
-from data_juicer.utils.constant import Fields
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('suffix_filter') -class SuffixFilter(Filter): - """Filter to keep samples with specified suffix.""" - -
[docs] def __init__(self, suffixes: Union[str, List[str]] = [], *args, **kwargs): - """ - Initialization method. - - :param suffixes: the suffixes of samples to keep. - For example: '.txt', 'txt' or ['txt', '.pdf', 'docx'] - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - if suffixes is None: - self.suffixes = [] - elif isinstance(suffixes, str): - self.suffixes = [suffixes] - else: - self.suffixes = suffixes
- -
[docs] def compute_stats_single(self, sample): - return sample
- -
[docs] def process_single(self, sample): - if self.suffixes: - if sample[Fields.suffix] in self.suffixes: - return True - else: - return False - else: - return True
diff --git a/_modules/data_juicer/ops/filter/text_action_filter.html b/_modules/data_juicer/ops/filter/text_action_filter.html
deleted file mode 100644
index b2717fcd6..000000000
--- a/_modules/data_juicer/ops/filter/text_action_filter.html
+++ /dev/null
@@ -1,178 +0,0 @@

Source code for data_juicer.ops.filter.text_action_filter
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import AUTOINSTALL
-from data_juicer.utils.mm_utils import remove_special_tokens
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-
-OP_NAME = 'text_action_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class TextActionFilter(Filter): - """ - Filter to keep samples whose texts contain a sufficient number of - actions. - """ -
[docs] def __init__(self, - lang: str = 'en', - min_action_num: int = 1, - *args, - **kwargs): - """ - Initialization method. - - :param lang: language of the text in the samples. 'en' for detection of - actions in English and 'zh' for detection of actions in Chinese. - :param min_action_num: The min action number in the filtering. Samples - will be filtered if the action number in their text is below this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - # '--no-deps' means do not update numpy - AUTOINSTALL.check(['spacy-pkuseg'], '--no-deps') - - if lang not in ['en', 'zh']: - raise ValueError( - f'Language [{lang}] is not supported in action detection. ' - f'Can only be one of ["en", "zh"].') - self.lang = lang - self.model_key = prepare_model(model_type='spacy', lang=lang) - self.action_poss = ['VERB'] - self.action_tags = ['VV', 'VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN'] - self.min_action_num = min_action_num
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.num_action in sample[Fields.stats]: - return sample - - text = remove_special_tokens(sample[self.text_key]) - - # process text via spacy and count the actions in text - model = get_model(self.model_key) - doc = model(text) - num_action = 0 - for token in doc: - if token.pos_ in self.action_poss \ - and token.tag_ in self.action_tags: - num_action += 1 - sample[Fields.stats][StatsKeys.num_action] = num_action - - return sample
- -
[docs] def process_single(self, sample): - num_action = sample[Fields.stats][StatsKeys.num_action] - if self.min_action_num <= num_action: - return True - else: - return False
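A rough standalone equivalent of the action count, assuming an English spaCy pipeline such as en_core_web_sm is installed (the OP itself resolves models through prepare_model):

import spacy

nlp = spacy.load('en_core_web_sm')
doc = nlp('She runs and jumps over the fence.')
num_action = sum(
    1 for token in doc
    if token.pos_ == 'VERB'
    and token.tag_ in ('VV', 'VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN'))
print(num_action)  # typically 2 -> kept with the default min_action_num=1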
diff --git a/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html b/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html
deleted file mode 100644
index 35729999c..000000000
--- a/_modules/data_juicer/ops/filter/text_entity_dependency_filter.html
+++ /dev/null
@@ -1,215 +0,0 @@

Source code for data_juicer.ops.filter.text_entity_dependency_filter
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import AUTOINSTALL
-from data_juicer.utils.mm_utils import remove_special_tokens
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-
-OP_NAME = 'text_entity_dependency_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class TextEntityDependencyFilter(Filter): - """ - Identify entities in the text that are independent of other tokens, - and filter samples based on this dependency information. Samples whose - texts contain no entities will be omitted. - """ -
[docs] def __init__(self, - lang: str = 'en', - min_dependency_num: int = 1, - any_or_all: str = 'all', - *args, - **kwargs): - """ - Initialization method. - - :param lang: language of the text in the samples. 'en' for detection of - entities in English and 'zh' for detection of entities in Chinese. - :param min_dependency_num: The min number of dependency edges in the - filtering. An entity is considered independent if its number of - edges in the dependency tree is below this parameter. - :param any_or_all: keep this sample with 'any' or 'all' strategy. - 'any': keep this sample if any entity is dependent. 'all': keep - this sample only if all entities are dependent. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - # '--no-deps' means do not update numpy - AUTOINSTALL.check(['spacy-pkuseg'], '--no-deps') - - if lang not in ['en', 'zh']: - raise ValueError( - f'Language [{lang}] is not supported in entities detection. ' - f'Can only be one of ["en", "zh"].') - self.lang = lang - self.model_key = prepare_model(model_type='spacy', lang=lang) - self.entity_poss = ['NOUN', 'PROPN', 'PRON'] - self.entity_tags = ['NN', 'NR', 'PN', 'NNS', 'NNP', 'NNPS', 'PRP'] - self.min_dependency_num = min_dependency_num - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any')
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.num_dependency_edges in sample[Fields.stats]: - return sample - - text = remove_special_tokens(sample[self.text_key]) - - # identify entities - model = get_model(self.model_key) - doc = model(text) - entity_to_dependency_nums = {} - for token in doc: - if token.pos_ in self.entity_poss \ - and token.tag_ in self.entity_tags: - entity_to_dependency_nums[token] = 0 - - # count the edges of each entity in dependency tree - for obj in entity_to_dependency_nums: - if obj.dep_ != 'ROOT': - entity_to_dependency_nums[obj] += 1 - for token in doc: - # skip punctuation marks such as ',' and '.' - if token.pos_ == 'PUNCT': - continue - - if token.head in entity_to_dependency_nums.keys( - ) and token.dep_ != 'ROOT': - entity_to_dependency_nums[token.head] += 1 - - sample[Fields.stats][StatsKeys.num_dependency_edges] = [ - n for _, n in entity_to_dependency_nums.items() - ] - - return sample
- -
[docs] def process_single(self, sample): - num_dependency_edges = sample[Fields.stats][ - StatsKeys.num_dependency_edges] - keep_bools = np.array([ - self.min_dependency_num <= num_edge - for num_edge in num_dependency_edges - ]) - # omit the samples without entity - if len(keep_bools) <= 0: - return False - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
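A simplified sketch of the per-entity edge count on a parsed doc, reusing the nlp pipeline from the previous example and restricting entities by POS only:

doc = nlp('The dog chased the ball.')
entity_edges = {t: 0 for t in doc if t.pos_ in ('NOUN', 'PROPN', 'PRON')}
for t in doc:
    if t in entity_edges and t.dep_ != 'ROOT':
        entity_edges[t] += 1  # edge from the entity to its own head
    if t.pos_ != 'PUNCT' and t.head in entity_edges and t.dep_ != 'ROOT':
        entity_edges[t.head] += 1  # edge from a dependent token to the entity
print({t.text: n for t, n in entity_edges.items()})
# every entity with >= min_dependency_num edges counts as dependent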
diff --git a/_modules/data_juicer/ops/filter/text_length_filter.html b/_modules/data_juicer/ops/filter/text_length_filter.html
deleted file mode 100644
index 514957515..000000000
--- a/_modules/data_juicer/ops/filter/text_length_filter.html
+++ /dev/null
@@ -1,168 +0,0 @@

Source code for data_juicer.ops.filter.text_length_filter
-import sys
-
-from data_juicer.utils.constant import Fields, StatsKeys
-
-from ..base_op import OPERATORS, Filter
-
-
-
[docs]@OPERATORS.register_module('text_length_filter') -class TextLengthFilter(Filter): - """Filter to keep samples with total text length within a specific - range.""" - - _batched_op = True - -
[docs] def __init__(self, - min_len: int = 10, - max_len: int = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min text length in the filtering. samples - will be filtered if their text length is below this - parameter. - :param max_len: The max text length in the filtering. samples - will be filtered if their text length exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len
- -
[docs] def compute_stats_batched(self, samples): - samples_list = samples[self.text_key] - samples_stats = samples[Fields.stats] - for i, stat in enumerate(samples_stats): - # check if it's computed already - if StatsKeys.text_len in stat: - continue - else: - samples_stats[i][StatsKeys.text_len] = len(samples_list[i]) - - return samples
- -
[docs] def process_batched(self, samples): - if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_len <= stat[StatsKeys.text_len] <= self. - max_len, samples[Fields.stats]) - else: - # single sample for ray filter - if self.min_len <= samples[Fields.stats][ - StatsKeys.text_len] <= self.max_len: - return True - else: - return False
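The batched branch maps a list of per-sample stat dicts to a lazy iterator of booleans (Ray passes a single dict instead); a toy run with the stat key written as a plain string for illustration:

stats = [{'text_len': 5}, {'text_len': 50}, {'text_len': 5000}]
keep = map(lambda stat: 10 <= stat['text_len'] <= 100, stats)
print(list(keep))  # [False, True, False]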
diff --git a/_modules/data_juicer/ops/filter/token_num_filter.html b/_modules/data_juicer/ops/filter/token_num_filter.html
deleted file mode 100644
index c5edb383c..000000000
--- a/_modules/data_juicer/ops/filter/token_num_filter.html
+++ /dev/null
@@ -1,171 +0,0 @@

Source code for data_juicer.ops.filter.token_num_filter
-import sys
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import get_words_from_document
-
-OP_NAME = 'token_num_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class TokenNumFilter(Filter): - """Filter to keep samples with total token number within a specific - range.""" - -
[docs] def __init__(self, - hf_tokenizer: str = 'EleutherAI/pythia-6.9b-deduped', - min_num: int = 10, - max_num: int = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param hf_tokenizer: the tokenizer name of Hugging Face tokenizers. - :param min_num: The min filter token number in this op, samples - will be filtered if their token number is below this - parameter. - :param max_num: The max filter token number in this op, samples - will be filtered if their token number exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_num = min_num - self.max_num = max_num - self.hf_tokenizer = hf_tokenizer - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_tokenizer, - return_model=False)
- -
[docs] def compute_stats_single(self, sample): - # check if it's computed already - if StatsKeys.num_token in sample[Fields.stats]: - return sample - - tokenizer = get_model(self.model_key) - tokens = get_words_from_document( - sample[self.text_key], - token_func=tokenizer.tokenize if tokenizer else None) - sample[Fields.stats][StatsKeys.num_token] = len(tokens) - return sample
- -
[docs] def process_single(self, sample): - if self.min_num <= sample[Fields.stats][ - StatsKeys.num_token] <= self.max_num: - return True - else: - return False
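A standalone sketch of the token count with a Hugging Face tokenizer; the checkpoint here matches the OP's hf_tokenizer default, and the sentence is made up:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('EleutherAI/pythia-6.9b-deduped')
num_token = len(tokenizer.tokenize('Data-Juicer filters samples by token count.'))
print(num_token)  # the sample is kept iff min_num <= num_token <= max_num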
diff --git a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html b/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
deleted file mode 100644
index 9c8088686..000000000
--- a/_modules/data_juicer/ops/filter/video_aesthetics_filter.html
+++ /dev/null
@@ -1,312 +0,0 @@

Source code for data_juicer.ops.filter.video_aesthetics_filter
-import numpy as np
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video)
-
-from ...utils.model_utils import get_model, prepare_model
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'video_aesthetics_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -@INTER_SAMPLED_FRAMES.register_module(OP_NAME) -class VideoAestheticsFilter(Filter): - """Filter to keep data samples with aesthetics scores for specified frames - in the videos within a specific range. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_scorer_model: str = '', - trust_remote_code: bool = False, - min_score: float = 0.4, - max_score: float = 1.0, - frame_sampling_method: str = 'uniform', - frame_num: PositiveInt = 3, - any_or_all: str = 'any', - reduce_mode: str = 'avg', - *args, - **kwargs): - """ - Initialization method. - - :param hf_scorer_model: Huggingface model name for the aesthetics - predictor. By default, we will use - 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE', - refer to pypi.org/project/simple-aesthetics-predictor - :param min_score: Min score for the predicted aesthetics in a video. - :param max_score: Max score for the predicted aesthetics in a video. - :param frame_sampling_method: sampling method of extracting frame - images from the videos. - Should be one of ["all_keyframes", "uniform"]. - The former one extracts all key frames and the latter one extract - specified number of frames uniformly from the video. - Default: "uniform" with frame_num=3, considering that the number of - keyframes can be large while their difference is usually small - in terms of their aesthetics. - :param frame_num: the number of frames to be extracted uniformly from - the video. Only works when frame_sampling_method is "uniform". If - it's 1, only the middle frame will be extracted. If it's 2, only - the first and the last frames will be extracted. If it's larger - than 2, in addition to the first and the last frames, other frames - will be extracted uniformly within the video duration. - :param any_or_all: Keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param reduce_mode: reduce mode when one sample corresponds to - multiple frames, must be one of ['avg','max', 'min']. - 'avg': Take the average of multiple values - 'max': Take the max of multiple values - 'min': Take the min of multiple values - :param args: Extra positional arguments. - :param kwargs: Extra keyword arguments. - """ - - super().__init__(*args, **kwargs) - if hf_scorer_model == '': - hf_scorer_model = \ - 'shunk031/aesthetics-predictor-v2-sac-logos-ava1-l14-linearMSE' - self.min_score = min_score - self.max_score = max_score - - if frame_sampling_method not in ['all_keyframes', 'uniform']: - raise ValueError( - f'Frame sampling method ' - f'[{frame_sampling_method}] is not supported. ' - f'Can only be one of ["all_keyframes", "uniform"].') - - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - if reduce_mode not in ['avg', 'max', 'min']: - raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. ' - f'Can only be one of ["avg", "max", "min"].') - self.any = (any_or_all == 'any') - self.reduce_mode = reduce_mode - - self.model_key = prepare_model( - model_type='simple_aesthetics', - pretrained_model_name_or_path=hf_scorer_model, - trust_remote_code=trust_remote_code) - # the original score predicted by laion-ai's scorer is within [0, 10] - self.need_normalized_by_ten = ('shunk031/aesthetics-predictor' - in hf_scorer_model) - self.frame_sampling_method = frame_sampling_method - self.frame_num = frame_num - - self.sampled_frames_key_suffix = f'-{frame_sampling_method}' + \ - ('' if frame_sampling_method == 'all_keyframes' - else f'-{frame_num}')
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False): - # check if it's computed already - if StatsKeys.video_frames_aesthetics_score in sample[Fields.stats]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = ( - np.array([], dtype=np.float64)) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - aesthetics_scores = [] - for key, video in videos.items(): - sampled_frames_key = key + self.sampled_frames_key_suffix - if video is None: - continue - elif context and sampled_frames_key in sample[Fields.context]: - # sampled frames can be found in the context - frames = sample[Fields.context][sampled_frames_key] - else: - # extract frame images - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly( - video, self.frame_num) - else: - frames = [] - - # store the sampled frames in the context - if context: - sample[Fields.context][sampled_frames_key] = frames - frame_images = [frame.to_image() for frame in frames] - - if len(frame_images) > 0: - # compute aesthetics_scores - model, processor = get_model(self.model_key, - rank=rank, - use_cuda=self.use_cuda()) - inputs = processor(images=frame_images, - return_tensors='pt').to(model.device) - with torch.no_grad(): - outputs = model(**inputs) - if self.need_normalized_by_ten: - aesthetics_score = outputs.logits / 10.0 - else: - aesthetics_score = outputs.logits - - if self.reduce_mode == 'avg': - aesthetics_score = float(aesthetics_score.mean()) - elif self.reduce_mode == 'max': - aesthetics_score = float(aesthetics_score.max()) - else: - aesthetics_score = float(aesthetics_score.min()) - else: - aesthetics_score = 0.0 - - aesthetics_scores.append(aesthetics_score) - - logger.debug(f'aesthetics_score: {aesthetics_scores}') - - sample[Fields.stats][StatsKeys.video_frames_aesthetics_score] = ( - aesthetics_scores) - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - - return sample
- -
[docs] def process_single(self, sample): - aesthetics_scores = ( - sample)[Fields.stats][StatsKeys.video_frames_aesthetics_score] - if len(aesthetics_scores) <= 0: - return True - - keep_bools = np.array([ - self.min_score <= aesthetics_score <= self.max_score - for aesthetics_score in aesthetics_scores - ]) - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
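A toy illustration of reduce_mode combined with the any/all keep strategy, using made-up per-frame scores for two videos:

import numpy as np

frame_scores = {'v1.mp4': [0.35, 0.55, 0.60], 'v2.mp4': [0.10, 0.20, 0.15]}
per_video = [float(np.mean(s)) for s in frame_scores.values()]  # reduce_mode='avg'
keep_bools = np.array([0.4 <= s <= 1.0 for s in per_video])     # default score range
print(keep_bools.any())  # True  -> kept with any_or_all='any'
print(keep_bools.all())  # False -> dropped with any_or_all='all'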
diff --git a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html b/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html
deleted file mode 100644
index c44c3e60b..000000000
--- a/_modules/data_juicer/ops/filter/video_aspect_ratio_filter.html
+++ /dev/null
@@ -1,203 +0,0 @@

Source code for data_juicer.ops.filter.video_aspect_ratio_filter
-from fractions import Fraction
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
-                                        load_video)
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_VIDEOS
-
-
-
[docs]@OPERATORS.register_module('video_aspect_ratio_filter') -@LOADED_VIDEOS.register_module('video_aspect_ratio_filter') -class VideoAspectRatioFilter(Filter): - """Filter to keep samples with video aspect ratio within a specific range. - AspectRatio = W / H. - """ - -
[docs] def __init__(self, - min_ratio: str = '9/21', - max_ratio: str = '21/9', - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param min_ratio: The minimum aspect ratio to keep samples, - supported format is a string, such as "9:21" or "9/21". - :param max_ratio: The maximum aspect ratio to keep samples, - supported format is a string, such as "21:9" or "21/9". - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_ratio = Fraction(str(min_ratio).replace(':', '/')) - self.max_ratio = Fraction(str(max_ratio).replace(':', '/')) - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any')
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.video_aspect_ratios in sample[Fields.stats]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][StatsKeys.video_aspect_ratios] = np.array( - [], dtype=np.float64) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - # compute aspect ratios for each video with W/H - video_aspect_ratios = {} - for key, video in videos.items(): - stream = video.streams.video[0] - video_aspect_ratios[ - key] = stream.codec_context.width / stream.codec_context.height - if not context: - close_video(video) - - sample[Fields.stats][StatsKeys.video_aspect_ratios] = [ - video_aspect_ratios[key] for key in loaded_video_keys - ] - - return sample
- -
[docs] def process_single(self, sample): - video_aspect_ratios = sample[Fields.stats][ - StatsKeys.video_aspect_ratios] - - keep_bools = np.array([ - self.min_ratio <= Fraction(aspect_ratio) <= self.max_ratio - for aspect_ratio in video_aspect_ratios - ]) - if len(keep_bools) <= 0: - return True - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
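The ':'-or-'/' parsing reduces to exact Fraction arithmetic; a quick check against the OP's default bounds:

from fractions import Fraction

min_ratio = Fraction('9/21')
max_ratio = Fraction('21:9'.replace(':', '/'))
for w, h in [(1920, 1080), (640, 1600)]:
    print((w, h), min_ratio <= Fraction(w, h) <= max_ratio)
# (1920, 1080) True  -- 16/9 lies inside [9/21, 21/9]
# (640, 1600) False  -- 2/5 falls below 9/21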
diff --git a/_modules/data_juicer/ops/filter/video_duration_filter.html b/_modules/data_juicer/ops/filter/video_duration_filter.html
deleted file mode 100644
index bff3aa717..000000000
--- a/_modules/data_juicer/ops/filter/video_duration_filter.html
+++ /dev/null
@@ -1,202 +0,0 @@

Source code for data_juicer.ops.filter.video_duration_filter
-import sys
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
-                                        load_video)
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_VIDEOS
-
-OP_NAME = 'video_duration_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoDurationFilter(Filter): - """Keep data samples whose videos' durations are within a specified range. - """ - -
[docs] def __init__(self, - min_duration: float = 0, - max_duration: float = sys.maxsize, - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param min_duration: The min video duration to keep samples in seconds. - It's 0 by default. - :param max_duration: The max video duration to keep samples in seconds. - It's sys.maxsize by default. - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_duration = min_duration - self.max_duration = max_duration - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any')
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.video_duration in sample[Fields.stats]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][StatsKeys.video_duration] = np.array( - [], dtype=np.float64) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - video_durations = {} - for video_key, video in videos.items(): - stream = video.streams.video[0] - video_durations[video_key] = round(stream.duration * - stream.time_base) - if not context: - close_video(video) - - # get video durations - sample[Fields.stats][StatsKeys.video_duration] = [ - video_durations[video_key] for video_key in sample[self.video_key] - ] - - return sample
- -
[docs] def process_single(self, sample): - video_durations = sample[Fields.stats][StatsKeys.video_duration] - keep_bools = np.array([ - self.min_duration <= duration <= self.max_duration - for duration in video_durations - ]) - if len(keep_bools) <= 0: - return True - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
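A hedged sketch of the same duration computation using PyAV directly (the OP goes through data_juicer's load_video helper); the file path is hypothetical:

import av

with av.open('example.mp4') as container:
    stream = container.streams.video[0]
    duration = round(stream.duration * stream.time_base)  # seconds
print(duration)  # kept iff min_duration <= duration <= max_duration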
diff --git a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html b/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
deleted file mode 100644
index 1a231951b..000000000
--- a/_modules/data_juicer/ops/filter/video_frames_text_similarity_filter.html
+++ /dev/null
@@ -1,323 +0,0 @@

Source code for data_juicer.ops.filter.video_frames_text_similarity_filter
-import numpy as np
-from PIL import ImageOps
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
-                                        extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video,
-                                        remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
-
-OP_NAME = 'video_frames_text_similarity_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -@INTER_SAMPLED_FRAMES.register_module(OP_NAME) -class VideoFramesTextSimilarityFilter(Filter): - """Filter to keep samples whose similarities between sampled video frame - images and text are within a specific range.""" - - _accelerator = 'cuda' -
[docs] def __init__(self, - hf_clip='openai/clip-vit-base-patch32', - trust_remote_code=False, - min_score: float = 0.1, - max_score: float = 1.0, - frame_sampling_method: str = 'all_keyframes', - frame_num: PositiveInt = 3, - horizontal_flip: bool = False, - vertical_flip: bool = False, - any_or_all: str = 'any', - reduce_mode: str = 'avg', - *args, - **kwargs): - """ - Initialization method. - - :param hf_clip: clip model name on huggingface to compute - the similarity between frame image and text. It's kind of - language-related. For example, for Chinese datasets, ChineseCLIP - might be a better choice. - :param min_score: the min similarity to keep samples. - :param max_score: the max similarity to keep samples. - :param frame_sampling_method: sampling method of extracting frame - images from the videos. - Should be one of ["all_keyframes", "uniform"]. - The former one extracts all key frames (the number of which depends - on the duration of the video) and the latter one extract specified - number of frames uniformly from the video. - Default: "all_keyframes". - :param frame_num: the number of frames to be extracted uniformly from - the video. Only works when frame_sampling_method is "uniform". If - it's 1, only the middle frame will be extracted. If it's 2, only - the first and the last frames will be extracted. If it's larger - than 2, in addition to the first and the last frames, other frames - will be extracted uniformly within the video duration. - :param horizontal_flip: flip frame image horizontally (left to right). - :param vertical_flip: flip frame image vertically (top to bottom). - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param reduce_mode: reduce mode when one text corresponds to - multiple video frame images in a chunk. - 'avg': Take the average of multiple values - 'max': Take the max of multiple values - 'min': Take the min of multiple values - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_score = min_score - self.max_score = max_score - if frame_sampling_method not in ['all_keyframes', 'uniform']: - raise ValueError( - f'Frame sampling method ' - f'[{frame_sampling_method}] is not supported. ' - f'Can only be one of ["all_keyframes", "uniform"].') - if reduce_mode not in ['avg', 'max', 'min']: - raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. ' - f'Can only be one of ["avg", "max", "min"].') - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any') - self.model_key = prepare_model(model_type='huggingface', - pretrained_model_name_or_path=hf_clip, - trust_remote_code=trust_remote_code) - self.reduce_mode = reduce_mode - self.horizontal_flip = horizontal_flip - self.vertical_flip = vertical_flip - self.frame_sampling_method = frame_sampling_method - self.frame_num = frame_num - - self.sampled_frames_key_suffix = f'-{frame_sampling_method}' + \ - ('' if frame_sampling_method == 'all_keyframes' - else f'-{frame_num}')
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False): - # check if it's computed already - if StatsKeys.video_frames_text_similarity in sample[Fields.stats]: - return sample - - # there is no videos in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][ - StatsKeys.video_frames_text_similarity] = np.array( - [], dtype=np.float64) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - text = sample[self.text_key] - offset = 0 - similarity = [] - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - for chunk in text.split(SpecialTokens.eoc): - count = chunk.count(SpecialTokens.video) - - # no video or no text - if count == 0 or len(chunk) == 0: - continue - else: - text_chunk = remove_special_tokens(chunk) - video_frame_images_chunk = [] - for video_key in loaded_video_keys[offset:offset + count]: - video = videos[video_key] - sampled_frames_key = video_key + \ - self.sampled_frames_key_suffix - - # extract frame images - if context and sampled_frames_key in sample[ - Fields.context]: - # context hit - frames = sample[Fields.context][sampled_frames_key] - else: - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly( - video, self.frame_num) - else: - frames = [] - - # store the sampled frames in the context - if context: - sample[Fields.context][sampled_frames_key] = frames - - frame_images = [frame.to_image() for frame in frames] - for image in frame_images: - if self.horizontal_flip: - image = ImageOps.mirror(image) - if self.vertical_flip: - image = ImageOps.flip(image) - video_frame_images_chunk.append(image) - - if len(video_frame_images_chunk) > 0: - inputs = processor(text=text_chunk, - images=video_frame_images_chunk, - return_tensors='pt', - truncation=True, - max_length=model.config.text_config. - max_position_embeddings, - padding=True).to(model.device) - - outputs = model(**inputs) - chunk_logits = outputs.logits_per_text / 100.0 - - if self.reduce_mode == 'avg': - chunk_similarity = chunk_logits.mean() - elif self.reduce_mode == 'max': - chunk_similarity = chunk_logits.max() - else: - chunk_similarity = chunk_logits.min() - else: - chunk_similarity = 0.0 - - similarity.append(float(chunk_similarity)) - offset += count - sample[Fields.stats][ - StatsKeys.video_frames_text_similarity] = similarity - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - - return sample
- -
[docs] def process_single(self, sample, rank=None): - similarity = sample[Fields.stats][ - StatsKeys.video_frames_text_similarity] - if len(similarity) <= 0: - return True - - keep_bools = np.array([ - self.min_score <= sim_value <= self.max_score - for sim_value in similarity - ]) - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
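A hedged mini-example of the frame-text similarity with CLIP; the checkpoint matches the OP's hf_clip default, and the frame is synthetic rather than sampled from a video:

import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained('openai/clip-vit-base-patch32')
processor = CLIPProcessor.from_pretrained('openai/clip-vit-base-patch32')
frames = [Image.new('RGB', (224, 224), 'gray')]
inputs = processor(text='a person riding a bike', images=frames,
                   return_tensors='pt', padding=True)
with torch.no_grad():
    sim = model(**inputs).logits_per_text / 100.0  # same scaling as above
print(float(sim.mean()))  # reduce_mode='avg'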
diff --git a/_modules/data_juicer/ops/filter/video_motion_score_filter.html b/_modules/data_juicer/ops/filter/video_motion_score_filter.html
deleted file mode 100644
index 8a16e06a7..000000000
--- a/_modules/data_juicer/ops/filter/video_motion_score_filter.html
+++ /dev/null
@@ -1,332 +0,0 @@

Source code for data_juicer.ops.filter.video_motion_score_filter
-import sys
-from contextlib import contextmanager
-from typing import Optional, Tuple, Union
-
-import numpy as np
-from pydantic import PositiveFloat, PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, UNFORKABLE, Filter
-
-cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'video_motion_score_filter'
-
-
-@contextmanager
-def VideoCapture(*args, **kwargs):
-    cap = cv2.VideoCapture(*args, **kwargs)
-    try:
-        yield cap
-    finally:
-        cap.release()
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class VideoMotionScoreFilter(Filter): - """Filter to keep samples with video motion scores within a specific - range. Farneback's algorithm from OpenCV is used to compute dense - optical flow. - """ - - _default_kwargs = { - 'pyr_scale': 0.5, - 'levels': 3, - 'winsize': 15, - 'iterations': 3, - 'poly_n': 5, - 'poly_sigma': 1.2, - 'flags': 0 - } -
[docs] def __init__(self, - min_score: float = 0.25, - max_score: float = sys.float_info.max, - sampling_fps: PositiveFloat = 2, - size: Union[PositiveInt, Tuple[PositiveInt], - Tuple[PositiveInt, PositiveInt], None] = None, - max_size: Optional[PositiveInt] = None, - relative: bool = False, - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param min_score: The minimum motion score to keep samples. - :param max_score: The maximum motion score to keep samples. - :param sampling_fps: The sampling rate in frames_per_second for - optical flow calculations. - :param size: Resize frames before computing optical flow. If size is a - sequence like (h, w), frame size will be matched to this. If size - is an int, smaller edge of frames will be matched to this number. - i.e, if height > width, then frame will be rescaled to (size * - height / width, size). Default `None` to keep the original size. - :param max_size: The maximum allowed for the longer edge of resized - frames. If the longer edge of frames is greater than max_size after - being resized according to size, size will be overruled so that the - longer edge is equal to max_size. As a result, the smaller edge may - be shorter than size. This is only supported if size is an int. - :param relative: If `True`, the optical flow magnitude is normalized to - a [0, 1] range, relative to the frame's diagonal length. - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_score = min_score - self.max_score = max_score - self.sampling_fps = sampling_fps - - if isinstance(size, (list, tuple)): - if len(size) not in [1, 2]: - raise ValueError( - f'Size must be an int or a 1 or 2 element tuple/list,' - f'not a {len(size)} element tuple/list.') - if isinstance(size, int): - size = (size, ) - self.size = size - self.max_size = max_size - self.relative = relative - - self.extra_kwargs = self._default_kwargs - for key in kwargs: - if key in self.extra_kwargs: - self.extra_kwargs[key] = kwargs[key] - - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any')
- -
[docs] def compute_stats_single(self, sample, context=False): - # check if it's computed already - if StatsKeys.video_motion_score in sample[Fields.stats]: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][StatsKeys.video_motion_score] = np.array( - [], dtype=np.float64) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - unique_motion_scores = {} - for video_key in loaded_video_keys: - # skip duplicate videos - if video_key in unique_motion_scores: - continue - - video_motion_scores = [] - with VideoCapture(video_key) as cap: - if cap.isOpened(): - fps = cap.get(cv2.CAP_PROP_FPS) - sampling_fps = min(self.sampling_fps, fps) - sampling_step = round(fps / sampling_fps) - total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - # at least two frames for computing optical flow - sampling_step = max(min(sampling_step, total_frames - 1), - 1) - - prev_frame = None - frame_count = 0 - while cap.isOpened(): - ret, frame = cap.read() - if not ret: - # If the frame can't be read, it could be due to - # a corrupt frame or reaching the end of the video. - break - - height, width, _ = frame.shape - new_size = _compute_resized_output_size( - (height, width), self.size, self.max_size) - if new_size != (height, width): - frame = cv2.resize(frame, - new_size, - interpolation=cv2.INTER_AREA) - - gray_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY) - if prev_frame is None: - prev_frame = gray_frame - continue - - flow = cv2.calcOpticalFlowFarneback( - prev_frame, gray_frame, None, **self.extra_kwargs) - mag, _ = cv2.cartToPolar(flow[..., 0], flow[..., 1]) - frame_motion_score = np.mean(mag) - if self.relative: - frame_motion_score /= np.hypot(*flow.shape[:2]) - video_motion_scores.append(frame_motion_score) - prev_frame = gray_frame - - # quickly skip frames - frame_count += sampling_step - cap.set(cv2.CAP_PROP_POS_FRAMES, frame_count) - - # may due to frame corruption - if not video_motion_scores: - unique_motion_scores[video_key] = -1 - else: - unique_motion_scores[video_key] = np.mean(video_motion_scores - or [-1]) - - sample[Fields.stats][StatsKeys.video_motion_score] = [ - unique_motion_scores[key] for key in loaded_video_keys - ] - return sample
- -
[docs] def process_single(self, sample): - video_motion_scores = sample[Fields.stats][ - StatsKeys.video_motion_score] - - keep_bools = np.array([ - self.min_score <= motion_score <= self.max_score - for motion_score in video_motion_scores - ]) - if len(keep_bools) <= 0: - return True - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
- - -def _compute_resized_output_size( - frame_size: Tuple[int, int], - size: Union[Tuple[PositiveInt], Tuple[PositiveInt, PositiveInt]], - max_size: Optional[int] = None, -) -> Tuple[int, int]: - h, w = frame_size - short, long = (w, h) if w <= h else (h, w) - - if size is None: # no change - new_short, new_long = short, long - elif len(size) == 1: # specified size only for the smallest edge - new_short = size[0] - new_long = int(new_short * long / short) - else: # specified both h and w - new_short, new_long = min(size), max(size) - - if max_size is not None and new_long > max_size: - new_short = int(max_size * new_short / new_long) - new_long = max_size - - new_w, new_h = (new_short, new_long) if w <= h else (new_long, new_short) - return new_h, new_w -
diff --git a/_modules/data_juicer/ops/filter/video_nsfw_filter.html b/_modules/data_juicer/ops/filter/video_nsfw_filter.html
deleted file mode 100644
index 2b12900ae..000000000
--- a/_modules/data_juicer/ops/filter/video_nsfw_filter.html
+++ /dev/null
@@ -1,285 +0,0 @@

Source code for data_juicer.ops.filter.video_nsfw_filter
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'video_nsfw_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -@INTER_SAMPLED_FRAMES.register_module(OP_NAME) -class VideoNSFWFilter(Filter): - """Filter to keep samples whose videos have low nsfw scores.""" - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_nsfw_model: str = 'Falconsai/nsfw_image_detection', - trust_remote_code: bool = False, - score_threshold: float = 0.5, - frame_sampling_method: str = 'all_keyframes', - frame_num: PositiveInt = 3, - reduce_mode: str = 'avg', - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param hf_nsfw_model: nsfw detection model name on huggingface. - :param score_threshold: the nsfw score threshold for samples. - range from 0 to 1. Samples with nsfw score less than this threshold - will be kept. - :param frame_sampling_method: sampling method of extracting frame - images from the videos. - Should be one of ["all_keyframes", "uniform"]. - The former one extracts all key frames (the number of which depends - on the duration of the video) and the latter one extract specified - number of frames uniformly from the video. - Default: "all_keyframes". - :param frame_num: the number of frames to be extracted uniformly from - the video. Only works when frame_sampling_method is "uniform". If - it's 1, only the middle frame will be extracted. If it's 2, only - the first and the last frames will be extracted. If it's larger - than 2, in addition to the first and the last frames, other frames - will be extracted uniformly within the video duration. - :param reduce_mode: reduce mode for multiple sampled video frames. - 'avg': Take the average of multiple values - 'max': Take the max of multiple values - 'min': Take the min of multiple values - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.score_threshold = score_threshold - if frame_sampling_method not in ['all_keyframes', 'uniform']: - raise ValueError( - f'Frame sampling method ' - f'[{frame_sampling_method}] is not supported. ' - f'Can only be one of ["all_keyframes", "uniform"].') - if reduce_mode not in ['avg', 'max', 'min']: - raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. ' - f'Can only be one of ["avg", "max", "min"].') - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any') - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_nsfw_model, - trust_remote_code=trust_remote_code) - self.reduce_mode = reduce_mode - self.frame_sampling_method = frame_sampling_method - self.frame_num = frame_num - - self.sampled_frames_key_suffix = f'-{frame_sampling_method}' + \ - ('' if frame_sampling_method == 'all_keyframes' - else f'-{frame_num}')
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False): - # check if it's computed already - if StatsKeys.video_nsfw_score in sample[Fields.stats]: - return sample - - # there is no videos in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.stats][StatsKeys.video_nsfw_score] = np.array( - [], dtype=np.float64) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - nsfw_scores = [] - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - for video_key, video in videos.items(): - sampled_frames_key = video_key + self.sampled_frames_key_suffix - - # extract frame images - if context and sampled_frames_key in sample[Fields.context]: - # context hit - frames = sample[Fields.context][sampled_frames_key] - else: - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly( - video, self.frame_num) - else: - frames = [] - - # store the sampled frames in the context - if context: - sample[Fields.context][sampled_frames_key] = frames - - frame_images = [frame.to_image() for frame in frames] - - if len(frame_images) > 0: - inputs = processor(images=frame_images, return_tensors='pt') - inputs = inputs.to(model.device) - outputs = model(**inputs) - logits = outputs.logits - cur_scores = [ - scores[1] for scores in torch.softmax(logits, dim=-1) - ] - cur_scores = torch.Tensor(cur_scores) - - if self.reduce_mode == 'avg': - cur_score = cur_scores.mean() - elif self.reduce_mode == 'max': - cur_score = cur_scores.max() - else: - cur_score = cur_scores.min() - else: - cur_score = 0.0 - - nsfw_scores.append(float(cur_score)) - - sample[Fields.stats][StatsKeys.video_nsfw_score] = nsfw_scores - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - - return sample
- -
[docs] def process_single(self, sample, rank=None): - itm_scores = sample[Fields.stats][StatsKeys.video_nsfw_score] - if len(itm_scores) <= 0: - return True - - keep_bools = np.array( - [itm_score < self.score_threshold for itm_score in itm_scores]) - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
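A toy illustration of how per-frame logits turn into one nsfw score: index 1 of the softmax is read as the nsfw probability, then reduced. The logits are made up; real ones come from the hf_nsfw_model classifier:

import torch

logits = torch.tensor([[2.0, -1.0], [0.5, 0.5], [-1.0, 3.0]])  # 3 sampled frames
cur_scores = torch.softmax(logits, dim=-1)[:, 1]
print(float(cur_scores.mean()))  # reduce_mode='avg'; kept if < score_threshold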
diff --git a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html b/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html
deleted file mode 100644
index b253a486d..000000000
--- a/_modules/data_juicer/ops/filter/video_ocr_area_ratio_filter.html
+++ /dev/null
@@ -1,307 +0,0 @@

Source code for data_juicer.ops.filter.video_ocr_area_ratio_filter
-from typing import List, Union
-
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer import cuda_device_count
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video)
-
-from ..base_op import OPERATORS, UNFORKABLE, Filter
-from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
-
-easyocr = LazyLoader('easyocr', 'easyocr')
-
-OP_NAME = 'video_ocr_area_ratio_filter'
-
-
-def triangle_area(p1, p2, p3):
-    """
-    Compute the triangle area according to its coordinates.
-    """
-    x1, y1 = p1
-    x2, y2 = p2
-    x3, y3 = p3
-    tri_area = 0.5 * np.abs(x1 * y2 + x2 * y3 + x3 * y1 - x2 * y1 - x3 * y2 -
-                            x1 * y3)
-    return tri_area
-
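As a quick sanity check of the shoelace formula, the unit square splits along a diagonal into two triangles of area 0.5 each — the same decomposition this file later applies to four-point quadrilaterals:

```python
# Unit square corners in order; split into two triangles along a diagonal.
p1, p2, p3, p4 = (0, 0), (1, 0), (1, 1), (0, 1)
assert triangle_area(p1, p2, p3) == 0.5
assert triangle_area(p3, p4, p1) == 0.5
# total quadrilateral area = 1.0, as expected for the unit square
```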
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -@INTER_SAMPLED_FRAMES.register_module(OP_NAME) -class VideoOcrAreaRatioFilter(Filter): - """Keep data samples whose detected text area ratios for specified frames - in the video are within a specified range. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self,
-                 min_area_ratio: float = 0,
-                 max_area_ratio: float = 1.0,
-                 frame_sample_num: PositiveInt = 3,
-                 languages_to_detect: Union[str, List[str]] = ['ch_sim', 'en'],
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_area_ratio: The min ocr area ratio to keep samples. It's 0
-            by default.
-        :param max_area_ratio: The max ocr area ratio to keep samples. It's
-            1.0 by default.
-        :param frame_sample_num: The number of sampled frames to calculate the
-            ocr area ratio. If it's 1, only the middle frame will be selected.
-            If it's 2, only the first and the last frames will be selected.
-            If it's larger than 2, in addition to the first and the last
-            frames, other frames will be sampled evenly within the video
-            duration.
-        :param languages_to_detect: texts in which languages should be
-            detected. Default: ['ch_sim', 'en']. The full language list can be
-            found here: https://www.jaided.ai/easyocr/.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all videos. 'any': keep this sample if any videos meet the
-            condition. 'all': keep this sample only if all videos meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.min_area_ratio = min_area_ratio
-        self.max_area_ratio = max_area_ratio
-        self.frame_sample_num = frame_sample_num
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        # initialize easyocr reader
-        if isinstance(languages_to_detect, str):
-            languages_to_detect = [languages_to_detect]
-        self.reader = easyocr.Reader(
-            lang_list=languages_to_detect,
-            recognizer=False,
-            verbose=False,
-            gpu=False,
-        )
-
-        # only the uniform sampling method is supported in this OP
-        self.sampled_frames_key_suffix = f'-uniform-{frame_sample_num}'
- -
[docs] def get_reader(self, rank): - if self.use_cuda(): - rank = 0 if rank is None else rank - device = f'cuda:{rank % cuda_device_count()}' - self.reader.detector = self.reader.detector.to(device) - self.reader.device = device - return self.reader
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.video_ocr_area_ratio in sample[Fields.stats]:
-            return sample
-
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.stats][StatsKeys.video_ocr_area_ratio] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        sample, videos = load_data_with_context(sample, context,
-                                                loaded_video_keys, load_video)
-
-        reader = self.get_reader(rank)
-        # compute ocr area ratios
-        video_ocr_area_ratios = {}
-        for video_key, container in videos.items():
-            sampled_frames_key = video_key + self.sampled_frames_key_suffix
-            if context and sampled_frames_key in sample[Fields.context]:
-                sampled_frames = sample[Fields.context][sampled_frames_key]
-            else:
-                sampled_frames = extract_video_frames_uniformly(
-                    container, self.frame_sample_num)
-                # store the sampled frames in the context
-                if context:
-                    sample[Fields.context][sampled_frames_key] = sampled_frames
-            images = [f.to_image() for f in sampled_frames]
-            # collect ocr results for each image
-            frame_ocr_area_ratios = []
-            for idx, image in enumerate(images):
-                # return horizontal detected results and free-form detected
-                # results
-                horizontal_list, free_list = reader.detect(np.asarray(image))
-                total_area = image.width * image.height
-                # rectangles
-                rect_area = 0
-                for xmin, xmax, ymin, ymax in horizontal_list[0]:
-                    if xmax < xmin or ymax < ymin:
-                        continue
-                    rect_area += (xmax - xmin) * (ymax - ymin)
-                # free-form
-                quad_area = 0
-                for points in free_list[0]:
-                    triangle1 = points[:3]
-                    quad_area += triangle_area(*triangle1)
-                    triangle2 = points[2:] + [points[0]]
-                    quad_area += triangle_area(*triangle2)
-                text_area = rect_area + quad_area
-                frame_ocr_area_ratios.append(text_area / total_area)
-
-                # for debug
-                # if False:
-                #     from PIL import ImageDraw
-                #     draw = ImageDraw.Draw(image)
-                #     for xmin, xmax, ymin, ymax in horizontal_list[0]:
-                #         if xmax < xmin or ymax < ymin:
-                #             continue
-                #         draw.rectangle((xmin, ymin, xmax, ymax),
-                #                        outline='red',
-                #                        width=1)
-                #     for points in free_list[0]:
-                #         points = [(int(item[0]), int(item[1]))
-                #                   for item in points]
-                #         draw.polygon(points, outline='blue', width=1)
-                #     image.save(f'{video_key}-{idx}.jpg')
-            video_ocr_area_ratios[video_key] = np.mean(frame_ocr_area_ratios)
-
-            if not context:
-                close_video(container)
-
-        # store ocr area ratios in the original video-key order
-        sample[Fields.stats][StatsKeys.video_ocr_area_ratio] = [
-            video_ocr_area_ratios[video_key]
-            for video_key in sample[self.video_key]
-        ]
-
-        return sample
- -
[docs] def process_single(self, sample): - video_ocr_area_ratios = sample[Fields.stats][ - StatsKeys.video_ocr_area_ratio] - keep_bools = np.array([ - self.min_area_ratio <= ocr_area_ratio <= self.max_area_ratio - for ocr_area_ratio in video_ocr_area_ratios - ]) - if len(keep_bools) <= 0: - return True - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
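The ratio computed above combines axis-aligned rectangles with free-form quadrilaterals split into two triangles. A minimal sketch with an invented detection result, reusing the triangle_area helper defined earlier in this file (the image size and boxes are hypothetical):

```python
width, height = 100, 100
total_area = width * height

# hypothetical easyocr-style results for one frame
horizontal_boxes = [(10, 30, 10, 20)]            # (xmin, xmax, ymin, ymax)
quad = [(50, 50), (70, 50), (70, 70), (50, 70)]  # four corner points

rect_area = sum((xmax - xmin) * (ymax - ymin)
                for xmin, xmax, ymin, ymax in horizontal_boxes)
quad_area = (triangle_area(*quad[:3]) +
             triangle_area(*(quad[2:] + [quad[0]])))

ratio = (rect_area + quad_area) / total_area
print(ratio)  # (200 + 400) / 10000 = 0.06
```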
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/filter/video_resolution_filter.html b/_modules/data_juicer/ops/filter/video_resolution_filter.html deleted file mode 100644 index eb42698b4..000000000 --- a/_modules/data_juicer/ops/filter/video_resolution_filter.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - data_juicer.ops.filter.video_resolution_filter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.filter.video_resolution_filter

-import sys
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.mm_utils import (close_video, load_data_with_context,
-                                        load_video)
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import LOADED_VIDEOS
-
-OP_NAME = 'video_resolution_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoResolutionFilter(Filter): - """Keep data samples whose videos' resolutions are within a specified range. - """ - -
[docs] def __init__(self, - min_width: int = 1, - max_width: int = sys.maxsize, - min_height: int = 1, - max_height: int = sys.maxsize, - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param min_width: The min horizontal resolution. - :param max_width: The max horizontal resolution. - :param min_height: The min vertical resolution. - :param max_height: The max vertical resolution. - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all videos. 'any': keep this sample if any videos meet the - condition. 'all': keep this sample only if all videos meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_width = min_width - self.max_width = max_width - self.min_height = min_height - self.max_height = max_height - if any_or_all not in ['any', 'all']: - raise ValueError(f'Keep strategy [{any_or_all}] is not supported. ' - f'Can only be one of ["any", "all"].') - self.any = (any_or_all == 'any')
- -
[docs] def compute_stats_single(self, sample, context=False):
-        # check if it's computed already
-        if StatsKeys.video_width in sample[Fields.stats] \
-                and StatsKeys.video_height in sample[Fields.stats]:
-            return sample
-
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.stats][StatsKeys.video_width] = np.array(
-                [], dtype=np.int64)
-            sample[Fields.stats][StatsKeys.video_height] = np.array(
-                [], dtype=np.int64)
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        sample, videos = load_data_with_context(sample, context,
-                                                loaded_video_keys, load_video)
-
-        video_width, video_height = dict(), dict()
-        for video_key, video in videos.items():
-            # default to load the first stream
-            video_stream = video.streams.video[0]
-
-            # failed to load the video stream
-            if video_stream is None:
-                return sample
-
-            video_width[video_key] = video_stream.codec_context.width
-            video_height[video_key] = video_stream.codec_context.height
-
-        # get video resolutions
-        sample[Fields.stats][StatsKeys.video_width] = [
-            video_width[video_key] for video_key in sample[self.video_key]
-        ]
-        sample[Fields.stats][StatsKeys.video_height] = [
-            video_height[video_key] for video_key in sample[self.video_key]
-        ]
-
-        if not context:
-            for vid_key in videos:
-                close_video(videos[vid_key])
-
-        return sample
- -
[docs] def process_single(self, sample): - ws = sample[Fields.stats][StatsKeys.video_width] - hs = sample[Fields.stats][StatsKeys.video_height] - keep_bools = np.array([ - self.min_width <= w <= self.max_width - and self.min_height <= h <= self.max_height - for w, h in zip(ws, hs) - ]) - if len(keep_bools) <= 0: - return True - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
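The per-video bounds check and the any/all reduction can be sketched in isolation with mock resolutions (all numbers below are invented):

```python
import numpy as np

# hypothetical per-video resolutions for one sample
widths, heights = [640, 1920], [360, 1080]
min_width, min_height = 1280, 720
max_width = max_height = 10 ** 6

keep_bools = np.array([
    min_width <= w <= max_width and min_height <= h <= max_height
    for w, h in zip(widths, heights)
])
print(keep_bools.any())  # True: the 1920x1080 video qualifies
print(keep_bools.all())  # False: the 640x360 video does not
```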
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html b/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html deleted file mode 100644 index 304c83d88..000000000 --- a/_modules/data_juicer/ops/filter/video_tagging_from_frames_filter.html +++ /dev/null @@ -1,219 +0,0 @@ - - - - - - - - data_juicer.ops.filter.video_tagging_from_frames_filter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.filter.video_tagging_from_frames_filter

-from typing import List
-
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields
-
-from ..base_op import OPERATORS, UNFORKABLE, Filter
-from ..mapper.video_tagging_from_frames_mapper import \
-    VideoTaggingFromFramesMapper
-from ..op_fusion import LOADED_VIDEOS
-
-OP_NAME = 'video_tagging_from_frames_filter'
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoTaggingFromFramesFilter(Filter): - """Filter to keep samples whose videos contain the given tags. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self,
-                 tags: List[str] = ['people'],
-                 contain: str = 'any',
-                 frame_sampling_method: str = 'all_keyframes',
-                 frame_num: PositiveInt = 3,
-                 tag_field_name: str = Fields.video_frame_tags,
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param tags: a tag list used to filter the videos. The full tag set
-            can be found in https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt  # noqa: E501
-        :param contain: require the videos to contain 'any' or 'all' of the
-            given tags. When tags is [], 'all' keeps all samples and 'any'
-            keeps no sample.
-        :param frame_sampling_method: sampling method of extracting frame
-            images from the videos. Should be one of
-            ["all_keyframes", "uniform"].
-            The former extracts all key frames (the number of which depends
-            on the duration of the video) and the latter extracts a specified
-            number of frames uniformly from the video.
-            Default: "all_keyframes".
-        :param frame_num: the number of frames to be extracted uniformly from
-            the video. Only works when frame_sampling_method is "uniform". If
-            it's 1, only the middle frame will be extracted. If it's 2, only
-            the first and the last frames will be extracted. If it's larger
-            than 2, in addition to the first and the last frames, other frames
-            will be extracted uniformly within the video duration.
-        :param tag_field_name: the field name to store the tags. It's
-            "__dj__video_frame_tags__" by default.
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all videos. 'any': keep this sample if any videos meet the
-            condition. 'all': keep this sample only if all videos meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        if contain not in ['any', 'all']:
-            raise ValueError(f'the containing type [{contain}] is not '
-                             f'supported. Can only be one of ["any", "all"].')
-        if frame_sampling_method not in ['all_keyframes', 'uniform']:
-            raise ValueError(
-                f'Frame sampling method [{frame_sampling_method}] is not '
-                f'supported. Can only be one of ["all_keyframes", "uniform"].')
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.tags = set([tag.lower() for tag in tags])
-        self.contain_any = (contain == 'any')
-        self.any = (any_or_all == 'any')
-        self.tag_field_name = tag_field_name
-        self.tagging_producer = VideoTaggingFromFramesMapper(
-            frame_sampling_method=frame_sampling_method,
-            frame_num=frame_num,
-            accelerator=self.accelerator,
-            tag_field_name=self.tag_field_name,
-        )
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False): - - sample = self.tagging_producer.process(sample, rank, context) - - return sample
- -
[docs] def process_single(self, sample, rank=None): - video_tags = sample[self.tag_field_name] - if len(video_tags) <= 0: - return True - - keep_bools = [] - for words in video_tags: - words = set([w.lower() for w in words]) - if self.contain_any: - keep_bools.append(bool(self.tags & words)) - else: - keep_bools.append(self.tags.issubset(words)) - keep_bools = np.array(keep_bools) - - # different strategies - if self.any: - return keep_bools.any() - else: - return keep_bools.all()
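The contain logic reduces to plain set operations; a small sketch with invented tags:

```python
# hypothetical frame tags for two videos in one sample
video_tags = [['people', 'dog'], ['car', 'road']]
tags = {'people'}

contains = [bool(tags & {t.lower() for t in words}) for words in video_tags]
print(contains)        # [True, False]
print(any(contains))   # 'any' strategy keeps the sample
print(all(contains))   # 'all' strategy drops it
```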
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/filter/video_watermark_filter.html b/_modules/data_juicer/ops/filter/video_watermark_filter.html deleted file mode 100644 index 7a6da7875..000000000 --- a/_modules/data_juicer/ops/filter/video_watermark_filter.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - data_juicer.ops.filter.video_watermark_filter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.filter.video_watermark_filter

-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, StatsKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..op_fusion import INTER_SAMPLED_FRAMES, LOADED_VIDEOS
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'video_watermark_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -@INTER_SAMPLED_FRAMES.register_module(OP_NAME) -class VideoWatermarkFilter(Filter): - """ - Filter to keep samples whose videos have no watermark with high - probability. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self,
-                 hf_watermark_model: str = 'amrul-hzz/watermark_detector',
-                 trust_remote_code: bool = False,
-                 prob_threshold: float = 0.8,
-                 frame_sampling_method: str = 'all_keyframes',
-                 frame_num: PositiveInt = 3,
-                 reduce_mode: str = 'avg',
-                 any_or_all: str = 'any',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_watermark_model: watermark detection model name on
-            huggingface.
-        :param trust_remote_code: whether to trust remote code from the
-            huggingface model hub when loading the model.
-        :param prob_threshold: the predicted watermark probability threshold
-            for samples, ranging from 0 to 1. Samples with watermark
-            probabilities less than this threshold will be kept.
-        :param frame_sampling_method: sampling method of extracting frame
-            images from the videos.
-            Should be one of ["all_keyframes", "uniform"].
-            The former extracts all key frames (the number of which depends
-            on the duration of the video) and the latter extracts a specified
-            number of frames uniformly from the video.
-            Default: "all_keyframes".
-        :param frame_num: the number of frames to be extracted uniformly from
-            the video. Only works when frame_sampling_method is "uniform". If
-            it's 1, only the middle frame will be extracted. If it's 2, only
-            the first and the last frames will be extracted. If it's larger
-            than 2, in addition to the first and the last frames, other frames
-            will be extracted uniformly within the video duration.
-        :param reduce_mode: reduce mode for multiple sampled video frames.
-            'avg': Take the average of multiple values
-            'max': Take the max of multiple values
-            'min': Take the min of multiple values
-        :param any_or_all: keep this sample with 'any' or 'all' strategy of
-            all videos. 'any': keep this sample if any videos meet the
-            condition. 'all': keep this sample only if all videos meet the
-            condition.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.prob_threshold = prob_threshold
-        if frame_sampling_method not in ['all_keyframes', 'uniform']:
-            raise ValueError(
-                f'Frame sampling method '
-                f'[{frame_sampling_method}] is not supported. '
-                f'Can only be one of ["all_keyframes", "uniform"].')
-        if reduce_mode not in ['avg', 'max', 'min']:
-            raise ValueError(f'Reduce mode [{reduce_mode}] is not supported. '
-                             f'Can only be one of ["avg", "max", "min"].')
-        if any_or_all not in ['any', 'all']:
-            raise ValueError(f'Keep strategy [{any_or_all}] is not supported. '
-                             f'Can only be one of ["any", "all"].')
-        self.any = (any_or_all == 'any')
-        self.model_key = prepare_model(
-            model_type='huggingface',
-            pretrained_model_name_or_path=hf_watermark_model,
-            trust_remote_code=trust_remote_code)
-        self.reduce_mode = reduce_mode
-        self.frame_sampling_method = frame_sampling_method
-        self.frame_num = frame_num
-
-        self.sampled_frames_key_suffix = f'-{frame_sampling_method}' + \
-            ('' if frame_sampling_method == 'all_keyframes'
-             else f'-{frame_num}')
- -
[docs] def compute_stats_single(self, sample, rank=None, context=False):
-        # check if it's computed already
-        if StatsKeys.video_watermark_prob in sample[Fields.stats]:
-            return sample
-
-        # there are no videos in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.stats][StatsKeys.video_watermark_prob] = np.array(
-                [], dtype=np.float64)
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        sample, videos = load_data_with_context(sample, context,
-                                                loaded_video_keys, load_video)
-
-        watermark_probs = []
-        model, processor = get_model(self.model_key, rank, self.use_cuda())
-
-        for video_key, video in videos.items():
-            sampled_frames_key = video_key + self.sampled_frames_key_suffix
-
-            # extract frame images
-            if context and sampled_frames_key in sample[Fields.context]:
-                frames = sample[Fields.context][sampled_frames_key]
-            else:
-                if self.frame_sampling_method == 'all_keyframes':
-                    frames = extract_key_frames(video)
-                elif self.frame_sampling_method == 'uniform':
-                    frames = extract_video_frames_uniformly(
-                        video, self.frame_num)
-                else:
-                    frames = []
-
-                # store the sampled frames in the context
-                if context:
-                    sample[Fields.context][sampled_frames_key] = frames
-
-            frame_images = [frame.to_image() for frame in frames]
-
-            if len(frame_images) > 0:
-                inputs = processor(images=frame_images, return_tensors='pt')
-                inputs = inputs.to(model.device)
-                outputs = model(**inputs)
-                logits = outputs.logits
-                cur_probs = [
-                    probs[1] for probs in torch.softmax(logits, dim=-1)
-                ]
-                cur_probs = torch.Tensor(cur_probs)
-
-                if self.reduce_mode == 'avg':
-                    cur_prob = cur_probs.mean()
-                elif self.reduce_mode == 'max':
-                    cur_prob = cur_probs.max()
-                else:
-                    cur_prob = cur_probs.min()
-            else:
-                cur_prob = 0.0
-            watermark_probs.append(float(cur_prob))
-
-        sample[Fields.stats][StatsKeys.video_watermark_prob] = watermark_probs
-
-        if not context:
-            for vid_key in videos:
-                close_video(videos[vid_key])
-
-        return sample
- -
[docs] def process_single(self, sample, rank=None):
-        watermark_probs = sample[Fields.stats][StatsKeys.video_watermark_prob]
-        if len(watermark_probs) <= 0:
-            return True
-
-        keep_bools = np.array(
-            [prob < self.prob_threshold for prob in watermark_probs])
-
-        # different strategies
-        if self.any:
-            return keep_bools.any()
-        else:
-            return keep_bools.all()
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/filter/word_repetition_filter.html b/_modules/data_juicer/ops/filter/word_repetition_filter.html deleted file mode 100644 index e60beb20a..000000000 --- a/_modules/data_juicer/ops/filter/word_repetition_filter.html +++ /dev/null @@ -1,233 +0,0 @@ - - - - - - - - data_juicer.ops.filter.word_repetition_filter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.filter.word_repetition_filter

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-from ..op_fusion import INTER_WORDS
-
-OP_NAME = 'word_repetition_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@INTER_WORDS.register_module(OP_NAME) -class WordRepetitionFilter(Filter): - """Filter to keep samples with word-level n-gram repetition ratio within a - specific range.""" - - _batched_op = True - -
[docs] def __init__(self, - lang: str = 'en', - tokenization: bool = False, - rep_len: PositiveInt = 10, - min_ratio: float = 0.0, - max_ratio: float = 0.5, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language. - :param tokenization: whether to use model to tokenize documents - :param rep_len: Repetition length for word-level n-gram. - :param min_ratio: The min filter ratio in this op, samples will - be filtered if their word-level n-gram repetition ratio is - below this parameter. - :param max_ratio: The max filter ratio in this op, samples will - be filtered if their word-level n-gram repetition ratio - exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.n = rep_len - self.min_ratio = min_ratio - self.max_ratio = max_ratio - self.model_key = None - self.lang = lang - - if tokenization: - self.model_key = prepare_model(model_type='sentencepiece', - lang=lang)
- -
[docs] def compute_stats_batched(self, samples, context=False): - samples_list = samples[self.text_key] - samples_stats = samples[Fields.stats] - words_key = f'{InterVars.words}-{self.model_key}' - - for idx, stat in enumerate(samples_stats): - # check if it's computed already - if StatsKeys.word_rep_ratio in stat: - continue - # try to get words from context - if context and words_key in samples[Fields.context][idx]: - words = samples[Fields.context][idx][words_key] - else: - tokenizer = get_model(self.model_key) - words = get_words_from_document( - samples_list[idx], - token_func=tokenizer.encode_as_pieces - if tokenizer else None) - if context: - samples[Fields.context][idx][words_key] = words - - # try to get refined words from context - refined_words_key = f'{InterVars.refined_words}-' \ - f'True-SPECIAL_CHARS-False-[2]-' - if context and refined_words_key in samples[Fields.context][idx]: - words = samples[Fields.context][idx][refined_words_key] - else: - words = words_refinement(words, - lower_case=True, - strip_chars=SPECIAL_CHARACTERS) - if context: - samples[Fields.context][idx][refined_words_key] = words - word_ngrams = [ - ' '.join(words[i:i + self.n]) - for i in range(len(words) - self.n + 1) - ] - freq_word_ngrams = {} - for word_ngram in word_ngrams: - freq_word_ngrams[word_ngram] = ( - freq_word_ngrams.get(word_ngram, 0) + 1) - - if len(freq_word_ngrams) == 0: - samples_stats[idx][StatsKeys.word_rep_ratio] = 0.0 - continue - - freq_word_ngrams = list(freq_word_ngrams.values()) - rep_more_than_one = [freq for freq in freq_word_ngrams if freq > 1] - samples_stats[idx][StatsKeys.word_rep_ratio] = ( - sum(rep_more_than_one) / - sum(freq_word_ngrams)) if sum(freq_word_ngrams) != 0 else 0.0 - - return samples
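A small worked example of the repetition ratio with rep_len = 2: the toy word list below yields four bigrams, of which 'the cat' occurs twice, so the repeated occurrences (2) over all occurrences (4) give 0.5. A sketch assuming the words are already refined:

```python
words = ['the', 'cat', 'sat', 'the', 'cat']  # toy refined word list
n = 2  # rep_len

ngrams = [' '.join(words[i:i + n]) for i in range(len(words) - n + 1)]
freq = {}
for g in ngrams:
    freq[g] = freq.get(g, 0) + 1

repeated = sum(c for c in freq.values() if c > 1)
ratio = repeated / sum(freq.values())
print(ratio)  # 2 / 4 = 0.5 -> 'the cat' appears twice
```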
- -
[docs] def process_batched(self, samples): - if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_ratio <= stat[StatsKeys.word_rep_ratio] - <= self.max_ratio, samples[Fields.stats]) - else: - # single sample for ray filter - if self.min_ratio <= samples[Fields.stats][ - StatsKeys.word_rep_ratio] <= self.max_ratio: - return True - else: - return False
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/filter/words_num_filter.html b/_modules/data_juicer/ops/filter/words_num_filter.html deleted file mode 100644 index 18c9c8c74..000000000 --- a/_modules/data_juicer/ops/filter/words_num_filter.html +++ /dev/null @@ -1,197 +0,0 @@ - - - - - - - - data_juicer.ops.filter.words_num_filter — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.filter.words_num_filter

-import sys
-
-from data_juicer.utils.constant import Fields, InterVars, StatsKeys
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Filter
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      words_refinement)
-from ..op_fusion import INTER_WORDS
-
-OP_NAME = 'words_num_filter'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@INTER_WORDS.register_module(OP_NAME) -class WordsNumFilter(Filter): - """Filter to keep samples with total words number within a specific - range.""" - - _batched_op = True - -
[docs] def __init__(self, - lang: str = 'en', - tokenization: bool = False, - min_num: int = 10, - max_num: int = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language. - :param tokenization: whether to use model to tokenize documents - :param min_num: The min filter word number in this op, samples - will be filtered if their word number is below this - parameter. - :param max_num: The max filter word number in this op, samples - will be filtered if their word number exceeds this - parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_num = min_num - self.max_num = max_num - self.model_key = None - self.lang = lang - - if tokenization: - self.model_key = prepare_model(model_type='sentencepiece', - lang=lang)
- -
[docs] def compute_stats_batched(self, samples, context=False): - samples_list = samples[self.text_key] - samples_stats = samples[Fields.stats] - words_key = f'{InterVars.words}-{self.model_key}' - - for idx, stat in enumerate(samples_stats): - # check if it's computed already - if StatsKeys.num_words in stat: - continue - if context and words_key in samples[Fields.context][idx]: - words = samples[Fields.context][idx][words_key] - else: - tokenizer = get_model(self.model_key) - words = get_words_from_document( - samples_list[idx], - token_func=tokenizer.encode_as_pieces - if tokenizer else None) - if context: - samples[Fields.context][idx][words_key] = words - words = words_refinement(words, strip_chars=SPECIAL_CHARACTERS) - samples_stats[idx][StatsKeys.num_words] = len(words) - - return samples
- -
[docs] def process_batched(self, samples): - if isinstance(samples[Fields.stats], list): - return map( - lambda stat: self.min_num <= stat[StatsKeys.num_words] <= self. - max_num, samples[Fields.stats]) - else: - # single sample for ray filter - if self.min_num <= samples[Fields.stats][ - StatsKeys.num_words] <= self.max_num: - return True - else: - return False
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/load.html b/_modules/data_juicer/ops/load.html deleted file mode 100644 index ae89a51df..000000000 --- a/_modules/data_juicer/ops/load.html +++ /dev/null @@ -1,138 +0,0 @@ - - - - - - - - data_juicer.ops.load — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.load

-from .base_op import OPERATORS
-from .op_fusion import fuse_operators
-
-
-
[docs]def load_ops(process_list, op_fusion=False): - """ - Load op list according to the process list from config file. - - :param process_list: A process list. Each item is an op name and its - arguments. - :param op_fusion: whether to fuse ops that share the same intermediate - variables. - :return: The op instance list. - """ - ops = [] - new_process_list = [] - for process in process_list: - op_name, args = list(process.items())[0] - ops.append(OPERATORS.modules[op_name](**args)) - new_process_list.append(process) - - # detect filter groups - if op_fusion: - new_process_list, ops = fuse_operators(new_process_list, ops) - - for op_cfg, op in zip(new_process_list, ops): - op._op_cfg = op_cfg - - return ops
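For reference, a minimal sketch of the expected process-list structure: each entry maps one operator name to its argument dict, mirroring a parsed YAML `process` section (the chosen operators and arguments here are illustrative and assume those ops are registered):

```python
# each item: {op_name: {arg: value, ...}}
process_list = [
    {'clean_email_mapper': {}},
    {'words_num_filter': {'min_num': 10, 'max_num': 10000}},
]
ops = load_ops(process_list, op_fusion=False)
```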
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html b/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html deleted file mode 100644 index 4a49ca54c..000000000 --- a/_modules/data_juicer/ops/mapper/audio_ffmpeg_wrapped_mapper.html +++ /dev/null @@ -1,195 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.audio_ffmpeg_wrapped_mapper

-from typing import Dict, List, Optional
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-
-from ..base_op import OPERATORS, Mapper
-
-with HiddenPrints():
-    ffmpeg = LazyLoader('ffmpeg', 'ffmpeg')
-
-OP_NAME = 'audio_ffmpeg_wrapped_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class AudioFFmpegWrappedMapper(Mapper): - """Simple wrapper for FFmpeg audio filters. - """ - -
[docs] def __init__( - self, - filter_name: Optional[str] = None, - filter_kwargs: Optional[Dict] = None, - global_args: Optional[List[str]] = None, - capture_stderr: bool = True, - overwrite_output: bool = True, - *args, - **kwargs, - ): - """ - Initialization method. - - :param filter_name: ffmpeg audio filter name. - :param filter_kwargs: keyword-arguments passed to ffmpeg filter. - :param global_args: list-arguments passed to ffmpeg command-line. - :param capture_stderr: whether to capture stderr. - :param overwrite_output: whether to overwrite output file. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - - self.filter_name = filter_name - self.filter_kwargs = filter_kwargs - self.global_args = global_args - self.capture_stderr = capture_stderr - self.overwrite_output = overwrite_output
- -
[docs] def process_single(self, sample): - # there is no audio in this sample - if self.audio_key not in sample or not sample[self.audio_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.audio_key] - - if self.filter_name is None: - return sample - - loaded_audio_keys = sample[self.audio_key] - processed = {} - for audio_key in loaded_audio_keys: - if audio_key in processed: - continue - - output_key = transfer_filename(audio_key, OP_NAME, - **self._init_parameters) - stream = (ffmpeg.input(audio_key).filter( - self.filter_name, **self.filter_kwargs).output(output_key)) - if self.global_args is not None: - stream = stream.global_args(*self.global_args) - stream.run(capture_stderr=self.capture_stderr, - overwrite_output=self.overwrite_output) - processed[audio_key] = output_key - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(loaded_audio_keys): - if sample[Fields.source_file][i] != value: - if processed[value] != value: - sample[Fields.source_file][i] = value - - sample[self.audio_key] = [processed[key] for key in loaded_audio_keys] - return sample
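Under the hood, each loop iteration assembles an ffmpeg-python pipeline. A rough stand-alone equivalent of one iteration, assuming the `ffmpeg-python` package and hypothetical file paths (the real op also routes output paths through transfer_filename):

```python
import ffmpeg  # provided by the ffmpeg-python package

in_path, out_path = 'in.wav', 'out.wav'  # hypothetical paths
# e.g. filter_name='volume', filter_kwargs={'volume': 0.5}
stream = ffmpeg.input(in_path).filter('volume', volume=0.5).output(out_path)
stream.run(capture_stderr=True, overwrite_output=True)
```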
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html deleted file mode 100644 index 8a9f21bef..000000000 --- a/_modules/data_juicer/ops/mapper/calibrate_qa_mapper.html +++ /dev/null @@ -1,222 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.calibrate_qa_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.calibrate_qa_mapper

-import re
-from typing import Dict, Optional
-
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-OP_NAME = 'calibrate_qa_mapper'
-
-
-# TODO: LLM-based inference.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class CalibrateQAMapper(Mapper): - """ - Mapper to calibrate question-answer pairs based on reference text. - """ - - # avoid leading whitespace - DEFAULT_SYSTEM_PROMPT = ('请根据提供的【参考信息】对【问题】和【回答】进行校准,使其更加详细、准确。\n' - '按照以下格式输出:\n' - '【问题】\n' - '校准后的问题\n' - '【回答】\n' - '校准后的回答') - DEFAULT_INPUT_TEMPLATE = '{reference}\n{qa_pair}' - DEFAULT_REFERENCE_TEMPLATE = '【参考信息】\n{}' - DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' - DEFAULT_OUTPUT_PATTERN = r'【问题】\s*(.*?)\s*【回答】\s*(.*)' - -
[docs] def __init__(self,
-                 api_model: str = 'gpt-4o',
-                 *,
-                 api_url: Optional[str] = None,
-                 api_key: Optional[str] = None,
-                 response_path: Optional[str] = None,
-                 system_prompt: Optional[str] = None,
-                 input_template: Optional[str] = None,
-                 reference_template: Optional[str] = None,
-                 qa_pair_template: Optional[str] = None,
-                 output_pattern: Optional[str] = None,
-                 model_params: Optional[Dict] = None,
-                 sampling_params: Optional[Dict] = None,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param api_model: API model name.
-        :param api_url: API URL. Defaults to DJ_API_URL environment variable.
-        :param api_key: API key. Defaults to DJ_API_KEY environment variable.
-        :param response_path: Path to extract content from the API response.
-            Defaults to 'choices.0.message.content'.
-        :param system_prompt: System prompt for the calibration task.
-        :param input_template: Template for building the model input.
-        :param reference_template: Template for formatting the reference text.
-        :param qa_pair_template: Template for formatting question-answer pairs.
-        :param output_pattern: Regular expression for parsing model output.
-        :param model_params: Parameters for initializing the model.
-        :param sampling_params: Extra parameters passed to the API call.
-        :param kwargs: Extra keyword arguments.
-        """
-        super().__init__(**kwargs)
-
-        self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT
-        self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE
-        self.reference_template = reference_template or \
-            self.DEFAULT_REFERENCE_TEMPLATE
-        self.qa_pair_template = qa_pair_template or \
-            self.DEFAULT_QA_PAIR_TEMPLATE
-        self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN
-
-        self.model_params = model_params or {}
-        self.sampling_params = sampling_params or {}
-        # expand the normalized dict so that a None argument does not
-        # break the keyword expansion
-        self.model_key = prepare_model(model_type='api',
-                                       api_model=api_model,
-                                       api_url=api_url,
-                                       api_key=api_key,
-                                       response_path=response_path,
-                                       **self.model_params)
- -
[docs] def build_input(self, sample): - reference = self.reference_template.format(sample[self.text_key]) - qa_pair = self.qa_pair_template.format(sample[self.query_key], - sample[self.response_key]) - input_prompt = self.input_template.format(reference=reference, - qa_pair=qa_pair) - return input_prompt
- -
[docs] def parse_output(self, raw_output): - match = re.match(self.output_pattern, raw_output) - if match: - return match.group(1).strip(), match.group(2).strip() - else: - return None, None
- -
[docs] def process_single(self, sample=None, rank=None): - client = get_model(self.model_key, rank=rank) - - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': self.build_input(sample) - }] - output = client(messages, **self.sampling_params) - - parsed_q, parsed_a = self.parse_output(output) - if parsed_q: - sample[self.query_key] = parsed_q - if parsed_a: - sample[self.response_key] = parsed_a - - return sample
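The DEFAULT_OUTPUT_PATTERN expects the model to echo the 【问题】/【回答】 markers; a small sketch of how parse_output behaves on a well-formed reply (the reply text is invented):

```python
import re

pattern = r'【问题】\s*(.*?)\s*【回答】\s*(.*)'
raw_output = '【问题】\n校准后的问题\n【回答】\n校准后的回答'

match = re.match(pattern, raw_output)
if match:
    print(match.group(1).strip())  # '校准后的问题'
    print(match.group(2).strip())  # '校准后的回答'
```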
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html deleted file mode 100644 index c80c10693..000000000 --- a/_modules/data_juicer/ops/mapper/calibrate_query_mapper.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.calibrate_query_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.calibrate_query_mapper

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
-from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
-
-OP_NAME = 'calibrate_query_mapper'
-
-
-# TODO: LLM-based inference.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class CalibrateQueryMapper(CalibrateQAMapper): - """ - Mapper to calibrate query in question-answer pairs based on reference text. - """ - - DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【问题】进行校准,\ - 使其更加详细、准确,且仍可以由原答案回答。只输出校准后的问题,不要输出多余内容。' - -
[docs] def parse_output(self, raw_output): - return raw_output.strip(), None
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html b/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html deleted file mode 100644 index ea5d91ec0..000000000 --- a/_modules/data_juicer/ops/mapper/calibrate_response_mapper.html +++ /dev/null @@ -1,128 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.calibrate_response_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.calibrate_response_mapper

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
-from data_juicer.ops.mapper.calibrate_qa_mapper import CalibrateQAMapper
-
-OP_NAME = 'calibrate_response_mapper'
-
-
-# TODO: LLM-based inference.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class CalibrateResponseMapper(CalibrateQAMapper): - """ - Mapper to calibrate response in question-answer pairs based on reference text. - """ # noqa: E501 - - DEFAULT_SYSTEM_PROMPT = '请根据提供的【参考信息】对问答对中的【回答】进行校准,\ - 使其更加详细、准确,且仍可以回答原问题。只输出校准后的回答,不要输出多余内容。' - -
[docs] def parse_output(self, raw_output): - return None, raw_output.strip()
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html b/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html deleted file mode 100644 index decf84d9f..000000000 --- a/_modules/data_juicer/ops/mapper/chinese_convert_mapper.html +++ /dev/null @@ -1,201 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.chinese_convert_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.chinese_convert_mapper

-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, Mapper
-
-opencc = LazyLoader('opencc', 'opencc')
-
-OP_NAME = 'chinese_convert_mapper'
-
-OPENCC_CONVERTER = None
-
-
-def prepare_converter(mode):
-    mode_path = mode + '.json'
-    global OPENCC_CONVERTER
-    if OPENCC_CONVERTER is None:
-        # no converter created yet; build one for the requested mode
-        OPENCC_CONVERTER = opencc.OpenCC(mode_path)
-    if not OPENCC_CONVERTER.config.endswith(mode_path):
-        # the cached converter was built for a different mode;
-        # rebuild it with the requested conversion config
-        OPENCC_CONVERTER = opencc.OpenCC(mode_path)
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class ChineseConvertMapper(Mapper): - """Mapper to convert Chinese between Traditional Chinese, Simplified Chinese - and Japanese Kanji.""" - - _batched_op = True - -
[docs] def __init__(self, mode: str = 's2t', *args, **kwargs): - """ - Initialization method. - - :param mode: Choose the mode to convert Chinese: - - s2t: Simplified Chinese to Traditional Chinese, - - t2s: Traditional Chinese to Simplified Chinese, - - s2tw: Simplified Chinese to Traditional Chinese (Taiwan Standard), - - tw2s: Traditional Chinese (Taiwan Standard) to Simplified Chinese, - - s2hk: Simplified Chinese to Traditional Chinese - (Hong Kong variant), - - hk2s: Traditional Chinese (Hong Kong variant) to Simplified - Chinese, - - s2twp: Simplified Chinese to Traditional Chinese (Taiwan Standard) - with Taiwanese idiom, - - tw2sp: Traditional Chinese (Taiwan Standard) to Simplified Chinese - with Mainland Chinese idiom, - - t2tw: Traditional Chinese to Traditional Chinese (Taiwan Standard), - - tw2t: Traditional Chinese (Taiwan standard) to Traditional Chinese, - - hk2t: Traditional Chinese (Hong Kong variant) to Traditional - Chinese, - - t2hk: Traditional Chinese to Traditional Chinese - (Hong Kong variant), - - t2jp: Traditional Chinese Characters (Kyūjitai) to New Japanese - Kanji, - - jp2t: New Japanese Kanji (Shinjitai) to Traditional Chinese - Characters, - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - mode_list = [ - 's2t', 't2s', 's2tw', 'tw2s', 's2hk', 'hk2s', 's2twp', 'tw2sp', - 't2tw', 'tw2t', 'hk2t', 't2hk', 't2jp', 'jp2t' - ] - assert mode in mode_list, 'Please make sure mode is one of {}'.format( - mode_list) - self.mode = mode - prepare_converter(self.mode)
- -
[docs] def process_batched(self, samples): - prepare_converter(self.mode) - - samples[self.text_key] = [ - OPENCC_CONVERTER.convert(text) for text in samples[self.text_key] - ] - return samples
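For instance, the default 's2t' mode maps Simplified to Traditional characters; a quick sketch assuming the `opencc` package resolves the bundled 's2t.json' config the same way prepare_converter does:

```python
import opencc

converter = opencc.OpenCC('s2t.json')
print(converter.convert('汉字转换'))  # -> '漢字轉換'
```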
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html b/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html deleted file mode 100644 index 55605d9d1..000000000 --- a/_modules/data_juicer/ops/mapper/clean_copyright_mapper.html +++ /dev/null @@ -1,171 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.clean_copyright_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.clean_copyright_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_copyright_mapper') -class CleanCopyrightMapper(Mapper): - """Mapper to clean copyright comments at the beginning of the text - samples.""" - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/') - self.cpat = re.compile('copyright', re.IGNORECASE)
- - def _process_single_sample(self, sample): - r = self.pat.search(sample) - if r: - # found one, now see if it contains "copyright", if so strip it - span = r.span() - sub = sample[span[0]:span[1]] - if self.cpat.search(sub): - # cut it - sample = sample[:span[0]] + sample[span[1]:] - - return sample - - lines = sample.split('\n') - skip = 0 - - # Greedy replace any file that begins with comment block, most - # are copyright headers - for k in range(len(lines)): - if (lines[k].startswith('//') or lines[k].startswith('#') - or lines[k].startswith('--') or not lines[k]): - skip = skip + 1 - else: - break - - if skip: - # we skipped, consume it - sample = '\n'.join(lines[skip:]) - return sample - -
[docs] def process_batched(self, samples): - samples[self.text_key] = [ - self._process_single_sample(text) - for text in samples[self.text_key] - ] - return samples
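A short sketch of the first cleaning path above: a leading C-style comment block is removed only when it mentions 'copyright' (the input snippet is invented):

```python
import regex as re

pat = re.compile('/\\*[^*]*\\*+(?:[^/*][^*]*\\*+)*/')
cpat = re.compile('copyright', re.IGNORECASE)

code = '/* Copyright 2024, some org. */\nint main() { return 0; }\n'
m = pat.search(code)
if m and cpat.search(code[m.start():m.end()]):
    code = code[:m.start()] + code[m.end():]
print(code)  # '\nint main() { return 0; }\n'
```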
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/clean_email_mapper.html b/_modules/data_juicer/ops/mapper/clean_email_mapper.html deleted file mode 100644 index cd5debacf..000000000 --- a/_modules/data_juicer/ops/mapper/clean_email_mapper.html +++ /dev/null @@ -1,157 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.clean_email_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.clean_email_mapper

-from typing import Optional
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_email_mapper') -class CleanEmailMapper(Mapper): - """Mapper to clean email in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - pattern: Optional[str] = None, - repl: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param pattern: regular expression pattern to search for within text. - :param repl: replacement string, default is empty string. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - if pattern is None: - self.pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+' - else: - self.pattern = pattern - if ((len(pattern) > 2) and - (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): - self.pattern = pattern[2:-1] - - self.repl = repl
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if not re.search(self.pattern, text, flags=re.DOTALL): - continue - samples[self.text_key][idx] = re.sub(pattern=self.pattern, - repl=self.repl, - string=text, - flags=re.DOTALL) - - return samples
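A quick sketch of the default pattern in action (the addresses are invented):

```python
import regex as re

pattern = r'[A-Za-z0-9.\-+_]+@[a-z0-9.\-+_]+\.[a-z]+'
text = 'Contact alice@example.com or bob.smith@test.org for details.'
print(re.sub(pattern, '', text, flags=re.DOTALL))
# 'Contact  or  for details.'
```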
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/clean_html_mapper.html b/_modules/data_juicer/ops/mapper/clean_html_mapper.html deleted file mode 100644 index 324116c42..000000000 --- a/_modules/data_juicer/ops/mapper/clean_html_mapper.html +++ /dev/null @@ -1,151 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.clean_html_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.clean_html_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
-# --------------------------------------------------------
-
-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, Mapper
-
-selectolax = LazyLoader('selectolax', 'selectolax')
-
-OP_NAME = 'clean_html_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class CleanHtmlMapper(Mapper): - """Mapper to clean html code in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs)
- -
[docs] def process_batched(self, samples): - - def _clean_html(raw_html): - raw_html = raw_html.replace('<li>', '\n*') - raw_html = raw_html.replace('</li>', '') - raw_html = raw_html.replace('<ol>', '\n*') - raw_html = raw_html.replace('</ol>', '') - parser = selectolax.parser.HTMLParser(raw_html) - return parser.text() - - samples[self.text_key] = [ - _clean_html(text) for text in samples[self.text_key] - ] - return samples
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html b/_modules/data_juicer/ops/mapper/clean_ip_mapper.html deleted file mode 100644 index 45d5b9a10..000000000 --- a/_modules/data_juicer/ops/mapper/clean_ip_mapper.html +++ /dev/null @@ -1,160 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.clean_ip_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.clean_ip_mapper

-from typing import Optional
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_ip_mapper') -class CleanIpMapper(Mapper): - """Mapper to clean ipv4 and ipv6 address in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - pattern: Optional[str] = None, - repl: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param pattern: regular expression pattern to search for within text. - :param repl: replacement string, default is empty string. - :param args: extra args - :param kwargs: extra args - """ - - super().__init__(*args, **kwargs) - if pattern is None: - self.pattern = r'(?:(?:1[0-9][0-9]\.)|(?:2[0-4][0-9]\.)|' - self.pattern += r'(?:25[0-5]\.)|(?:[1-9][0-9]\.)|(?:[0-9]\.))' - self.pattern += r'{3}(?:(?:1[0-9][0-9])|(?:2[0-4][0-9])|' - self.pattern += r'(?:25[0-5])|(?:[1-9][0-9])|(?:[0-9]))|' - self.pattern += r'([\da-fA-F]{1,4}:){7}[\da-fA-F]{1,4}' # ipv6 - else: - self.pattern = pattern - if ((len(pattern) > 2) and - (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): - self.pattern = pattern[2:-1] - self.repl = repl
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if not re.search(self.pattern, text, flags=re.DOTALL): - continue - samples[self.text_key][idx] = re.sub(pattern=self.pattern, - repl=self.repl, - string=text, - flags=re.DOTALL) - return samples
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/clean_links_mapper.html b/_modules/data_juicer/ops/mapper/clean_links_mapper.html deleted file mode 100644 index f08d1f4eb..000000000 --- a/_modules/data_juicer/ops/mapper/clean_links_mapper.html +++ /dev/null @@ -1,167 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.clean_links_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.clean_links_mapper

-# Some code here has been modified from:
-# https://github.com/kallewesterling/CleanText/
-# --------------------------------------------------------
-from typing import Optional
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('clean_links_mapper') -class CleanLinksMapper(Mapper): - """Mapper to clean links like http/https/ftp in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - pattern: Optional[str] = None, - repl: str = '', - *args, - **kwargs): - """ - Initialization method. - - :param pattern: regular expression pattern to search for within text. - :param repl: replacement string, default is empty string. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - if pattern is None: - self.pattern = r'(?i)\b(' - self.pattern += r'(?:[a-z][\w-]+:(?:\/{1,3}|' - self.pattern += r'[a-z0-9%])|www\d{0,3}[.]|' - self.pattern += r'[a-z0-9.\-]+[.][a-z]{2,4}\/)' - self.pattern += r'(?:[^\s()<>]+|\(([^\s()<>]+|' - self.pattern += r'(\([^\s()<>]+\)))*\))' - self.pattern += r'+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|' - self.pattern += r'[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])' - self.pattern += r')' - else: - self.pattern = pattern - if ((len(pattern) > 2) and - (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): - self.pattern = pattern[2:-1] - self.repl = repl
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if not re.search(self.pattern, text, flags=re.DOTALL): - continue - - samples[self.text_key][idx] = re.sub(pattern=self.pattern, - repl=self.repl, - string=text, - flags=re.DOTALL) - return samples
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html b/_modules/data_juicer/ops/mapper/expand_macro_mapper.html deleted file mode 100644 index 98b9543b7..000000000 --- a/_modules/data_juicer/ops/mapper/expand_macro_mapper.html +++ /dev/null @@ -1,194 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.expand_macro_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.expand_macro_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/blob/rp_v1/data_prep/arxiv/arxiv_cleaner.py
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('expand_macro_mapper') -class ExpandMacroMapper(Mapper): - """Mapper to expand macro definitions in the document body of Latex - samples.""" - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs)
- - def _build_non_arg_macros_dict(self, file_content): - # regex for extracting \newcommand macros without arguments - non_arg_nc_reg = re.compile( - # this regex matches the following: - # \newcommand{\macro_name}{macro_value} - # \newcommand*{\macro_name}{macro_value} - # where macro_name is only allowed to contain letters and numbers; - # macro_value can contain any character. - pattern=r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$', - flags=re.MULTILINE) - - # regex for extracting \def macros without arguments - non_arg_def_reg = re.compile( - # this regex matches the following: - # \def\macro_name{macro_value} - # where macro_name is only allowed to contain letters and numbers; - # macro_value can contain any character. - pattern=r'\\def\s*(\\[a-zA-Z0-9]+?)\s*\{(.*?)\}$', - flags=re.MULTILINE) - - # Extract all user-defined LaTeX macros from the preamble - macros = {} - for reg in [non_arg_nc_reg, non_arg_def_reg]: - for match in reg.finditer(file_content): - # convert the macro name and value to a raw string that can be - # used in re.sub - macro_name = match.group(1).encode('unicode-escape').decode( - 'utf-8') - macro_val = match.group(2).encode('unicode-escape').decode( - 'utf-8') - - macros[macro_name] = macro_val - return macros - -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - non_arg_macros = self._build_non_arg_macros_dict(text) - - # TODO: macros that take arguments are not supported yet - arg_macros = {} - - # inline-expand all non-arg macros - for macro_name, macro_value in non_arg_macros.items(): - text = re.sub( - # make pattern grouped to make sure that the macro - # is not part of a longer alphanumeric word - pattern=r'(' + macro_name + r')' + r'([^a-zA-Z0-9])', - # replace the macro with its value and add back the - # character that was matched after the macro - repl=macro_value + r'\2', - string=text) - - # inline-expand all macros that use args - # TODO: inline-expand macros with args - for macro_name, macro_value in arg_macros.items(): - pass - - samples[self.text_key][idx] = text - - return samples
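End to end, the op extracts a preamble definition and expands it inline. A compact sketch of the same two regex steps on a toy LaTeX snippet — it uses re.escape in place of the op's unicode-escape trick, and, like the op, it leaves the \newcommand line in place (so that line's body is rewritten too):

```python
import regex as re

text = (r'\newcommand{\model}{GPT}' + '\n' +
        r'We evaluate \model{} and \model.')

nc_reg = re.compile(r'\\\bnewcommand\b\*?\{(\\[a-zA-Z0-9]+?)\}\{(.*?)\}$',
                    flags=re.MULTILINE)
macros = {m.group(1): m.group(2) for m in nc_reg.finditer(text)}

for name, value in macros.items():
    text = re.sub(r'(' + re.escape(name) + r')([^a-zA-Z0-9])',
                  value + r'\2', text)
print(text)
# \newcommand{GPT}{GPT}
# We evaluate GPT{} and GPT.
```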
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html b/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
deleted file mode 100644
index 8a1833a87..000000000
--- a/_modules/data_juicer/ops/mapper/fix_unicode_mapper.html
+++ /dev/null
@@ -1,151 +0,0 @@
-data_juicer.ops.mapper.fix_unicode_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.fix_unicode_mapper

-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, Mapper
-
-ftfy = LazyLoader('ftfy', 'ftfy')
-
-OP_NAME = 'fix_unicode_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class FixUnicodeMapper(Mapper): - """Mapper to fix unicode errors in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, normalization: str = None, *args, **kwargs): - """ - Initialization method. - - :param normalization: the specified form of Unicode - normalization mode, which can be one of - ['NFC', 'NFKC', 'NFD', 'NFKD'], default 'NFC'. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - if normalization: - self.normalization = normalization.upper() - else: - self.normalization = 'NFC' - - if self.normalization not in ['NFC', 'NFKC', 'NFD', 'NFKD']: - raise ValueError(f'Normalization mode [{normalization}] is not ' - 'supported. Can only be one of ' - '["NFC", "NFKC", "NFD", "NFKD"]')
- -
[docs] def process_batched(self, samples): - samples[self.text_key] = [ - ftfy.fix_text(text, normalization=self.normalization) - for text in samples[self.text_key] - ] - return samples
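For reference, a sketch of what the wrapped ftfy call does on a classic mojibake string (the exact repaired output is up to ftfy's heuristics):

    import ftfy

    # "doesn't" whose curly apostrophe went through a UTF-8/Windows-1252
    # round trip twice
    broken = 'The Mona Lisa doesnÃ¢â‚¬â„¢t have eyebrows.'
    print(ftfy.fix_text(broken, normalization='NFC'))
    # should recover: The Mona Lisa doesn't have eyebrows.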
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
deleted file mode 100644
index 455dfa757..000000000
--- a/_modules/data_juicer/ops/mapper/generate_qa_from_examples_mapper.html
+++ /dev/null
@@ -1,380 +0,0 @@
-data_juicer.ops.mapper.generate_qa_from_examples_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.generate_qa_from_examples_mapper

-import json
-import random
-import re
-from typing import Dict, Optional
-
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Mapper
-
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-rouge = LazyLoader('rouge', 'rouge')
-
-OP_NAME = 'generate_qa_from_examples_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class GenerateQAFromExamplesMapper(Mapper): - """ - Mapper to generate question and answer pairs from examples. - You should configure an empty dataset in your yaml config file: - ``` - generated_dataset_config: - type: 'EmptyFormatter' # use `RayEmptyFormatter` when enable ray - length: ${The number of generated samples} - feature_keys: ${text key} - ``` - The number of samples generated is determined by - the length of the empty dataset. - """ - - DEFAULT_SYSTEM_PROMPT = ( - '请你仔细观察多个示例数据的输入和输出,按照你的理解,总结出相应规矩,然后写出一个新的【问题】和【回答】。' - '注意,新生成的【问题】和【回答】需要满足如下要求:\n' - '1. 生成的【问题】和【回答】不能与输入的【问题】和【回答】一致,但是需要保持格式相同。\n' - '2. 生成的【问题】不一定要局限于输入【问题】的话题或领域,生成的【回答】需要正确回答生成的【问题】。\n' - '3. 提供的【问题】和【回答】可能是多轮对话,生成的【问题】和【回答】也可以是多轮,但是需要保持格式相同。\n' - '4. 生成的【问题】和【回答】必须成对出现,而且【问题】需要在【回答】之前。\n') - - DEFAULT_INPUT_TEMPLATE = '{}' - DEFAULT_EXAMPLE_TEMPLATE = '\n如下是一条示例数据:\n{}' - DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}\n' - DEFAULT_OUTPUT_PATTERN = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)' - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', - *, - seed_file: str = '', - example_num: PositiveInt = 3, - similarity_threshold: float = 0.7, - system_prompt: Optional[str] = None, - input_template: Optional[str] = None, - example_template: Optional[str] = None, - qa_pair_template: Optional[str] = None, - output_pattern: Optional[str] = None, - enable_vllm: bool = False, - model_params: Optional[Dict] = None, - sampling_params: Optional[Dict] = None, - **kwargs): - """ - Initialization method. - - :param hf_model: Hugginface model ID. - :param seed_file: Path to the seed file in chatml format. - :param example_num: The number of selected examples. - Randomly select N examples from "seed_file" and - put them into prompt as QA examples. - :param similarity_threshold: The similarity score threshold - between the generated samples and the seed examples. - Range from 0 to 1. Samples with similarity score less than - this threshold will be kept. - :param system_prompt: System prompt for guiding the generation task. - :param input_template: Template for building the input prompt. It must - include one placeholder '{}', which will be replaced by - `example_num` formatted examples defined by `example_template`. - :param example_template: Template for formatting one QA example. It - must include one placeholder '{}', which will be replaced by one - formatted qa_pair. - :param qa_pair_template: Template for formatting a single QA pair - within each example. Must include two placeholders '{}' for the - question and answer. - :param output_pattern: Regular expression pattern to extract questions - and answers from model response. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param model_params: Parameters for initializing the model. - :param sampling_params: Sampling parameters for text generation. - e.g {'temperature': 0.9, 'top_p': 0.95} - :param kwargs: Extra keyword arguments. - """ - super().__init__(**kwargs) - - if not seed_file: - raise ValueError( - 'Please provide `seed_file` in chatml format.' 
- 'Example: data-juicer/demos/data/demo-dataset-chatml.jsonl') - - self.seed_file = seed_file - self.example_num = example_num - self.similarity_threshold = similarity_threshold - self.similarity_type = 'rouge_l' - - self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT - self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE - self.example_template = example_template or self.DEFAULT_EXAMPLE_TEMPLATE # noqa: E501 - self.qa_pair_template = qa_pair_template or \ - self.DEFAULT_QA_PAIR_TEMPLATE - self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN - - self.enable_vllm = enable_vllm - model_params = model_params or {} - sampling_params = sampling_params or {} - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - # cannot initialize vllm replicas on different GPUs - self.num_proc = 1 - if model_params.get('tensor_parallel_size') is None: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - model_params['tensor_parallel_size'] = tensor_parallel_size - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - **model_params) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - return_pipe=True, - **model_params) - self.sampling_params = sampling_params - - self.seed_qa_samples = self._load_seed_qa_samples() - if len(self.seed_qa_samples) == 0: - raise ValueError('No QA data was parsed from the seed file!')
- - def _load_seed_qa_samples(self): - """Load QA pairs from chatml format file.""" - qa_samples = [] - with open(self.seed_file, encoding='utf-8') as f: - lines = f.readlines() - for line in lines: - line = line.strip() - qa_pairs = self._parse_chatml_str(line) - if len(qa_pairs) > 0: - qa_samples.append(qa_pairs) - return qa_samples - - def _sample_to_str(self, qa_sample): - return '\n'.join(['\n'.join(qa_pair) for qa_pair in qa_sample]) + '\n' - - def _max_rouge_l_score(self, hypothesis, references): - r = rouge.Rouge() - max_score = 0.0 - hyp_str = self._sample_to_str(hypothesis) - for reference in references: - ref_str = self._sample_to_str(reference) - scores = r.get_scores(hyp_str, ref_str) - rouge_l_score = scores[0]['rouge-l']['f'] - if rouge_l_score > max_score: - max_score = rouge_l_score - return max_score - - def _parse_chatml_str(self, sample_str): - user_input = None - assistant_output = None - qa_pairs = [] - data = json.loads(sample_str) - for message in data['messages']: - role = message['role'] - content = message['content'] - if role == 'user': - user_input = content - elif role == 'assistant': - assistant_output = content - qa_pairs.append((user_input, assistant_output)) - return qa_pairs - -
[docs] def build_input(self, qa_examples): - - def format_qa_pairs(qa_example): - return ''.join([ - self.qa_pair_template.format(q, a) for q, a in qa_example - if q and a - ]) - - # the default templates use positional '{}' placeholders (see the - # class docstring), so they must be formatted positionally - formatted_examples = ''.join([ - self.example_template.format(format_qa_pairs(qa_example)) - for qa_example in qa_examples - ]) - input_prompt = self.input_template.format(formatted_examples) - return input_prompt
- -
[docs] def parse_output(self, raw_output): - logger.debug(raw_output) - output_qa_pairs = [] - matches = re.findall(self.output_pattern, raw_output, re.DOTALL) - for match in matches: - question, answer = match - output_qa_pairs.append((question.strip(), answer.strip())) - return output_qa_pairs
- -
[docs] def process_single(self, sample=None, rank=None): - model, _ = get_model(self.model_key, rank, self.use_cuda()) - - random_qa_samples = random.sample(self.seed_qa_samples, - self.example_num) - input_prompt = self.build_input(random_qa_samples) - - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': input_prompt - }] - - if self.enable_vllm: - response = model.chat(messages, self.sampling_params) - output = response[0].outputs[0].text - else: - # model is pipe - response = model(messages, - return_full_text=False, - **self.sampling_params) - output = response[0]['generated_text'] - - output_qa_pairs = self.parse_output(output) - if len(output_qa_pairs) == 0: - logger.warning('Parse model response error! ' - 'No data generated for the current response!') - sample.update({ - self.query_key: '', - self.response_key: '', - self.history_key: self.empty_history() - }) - return sample - - if self.similarity_type == 'rouge_l': - sim_score = self._max_rouge_l_score(output_qa_pairs, - random_qa_samples) - else: - raise ValueError( - f'Not support similarity type "{self.similarity_type}"!') - - if sim_score <= self.similarity_threshold: - query, response = output_qa_pairs[-1] - history = output_qa_pairs[:-1] - if len(history) == 0: - history = self.empty_history() - else: - query = response = '' - history = self.empty_history() - logger.info('Filter this generated sample due to similarity.') - - sample.update({ - self.query_key: query, - self.response_key: response, - self.history_key: history - }) - return sample
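A self-contained sketch of how DEFAULT_OUTPUT_PATTERN carves QA pairs out of a raw model response; the sample string is hypothetical:

    import re

    raw = '【问题】\n1+1等于几?\n【回答】\n等于2。\n'
    pattern = r'【问题】(.*?)【回答】(.*?)(?=【问题】|$)'
    pairs = [(q.strip(), a.strip())
             for q, a in re.findall(pattern, raw, re.DOTALL)]
    # pairs == [('1+1等于几?', '等于2。')]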
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html b/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
deleted file mode 100644
index 02571066e..000000000
--- a/_modules/data_juicer/ops/mapper/generate_qa_from_text_mapper.html
+++ /dev/null
@@ -1,255 +0,0 @@
-data_juicer.ops.mapper.generate_qa_from_text_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.generate_qa_from_text_mapper

-import re
-from typing import Dict, Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
-OP_NAME = 'generate_qa_from_text_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class GenerateQAFromTextMapper(Mapper): - """ - Mapper to generate question and answer pairs from text. - Recommended model list: [ - 'alibaba-pai/pai-llama3-8b-doc2qa', - 'alibaba-pai/pai-baichuan2-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-4b-doc2qa', - 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - 'alibaba-pai/pai-qwen1_5-1b8-doc2qa', - 'alibaba-pai/pai-qwen1_5-0b5-doc2qa' - ] - These recommended models are all trained with Chinese data - and are suitable for Chinese. - """ - - _accelerator = 'cuda' - _batched_op = True - -
[docs] def __init__(self, - hf_model: str = 'alibaba-pai/pai-qwen1_5-7b-doc2qa', - *, - output_pattern: Optional[str] = None, - enable_vllm: bool = False, - model_params: Optional[Dict] = None, - sampling_params: Optional[Dict] = None, - **kwargs): - """ - Initialization method. - - :param hf_model: Hugginface model ID. - :param output_pattern: Regular expression pattern to extract - questions and answers from model response. - :param enable_vllm: Whether to use vllm for inference acceleration. - :param model_params: Parameters for initializing the model. - :param sampling_params: Sampling parameters for text generation, - e.g {'temperature': 0.9, 'top_p': 0.95} - :param kwargs: Extra keyword arguments. - - The default data format parsed by this interface is as follows: - Model Input: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Model Output: - 蒙古国的首都是乌兰巴托(Ulaanbaatar) - 冰岛的首都是雷克雅未克(Reykjavik) - Human: 请问蒙古国的首都是哪里? - Assistant: 你好,根据提供的信息,蒙古国的首都是乌兰巴托(Ulaanbaatar)。 - Human: 冰岛的首都是哪里呢? - Assistant: 冰岛的首都是雷克雅未克(Reykjavik)。 - ... - """ - - super().__init__(**kwargs) - - if output_pattern is None: - self.output_pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)' # noqa: E501 - else: - self.output_pattern = output_pattern - - self.enable_vllm = enable_vllm - model_params = model_params or {} - sampling_params = sampling_params or {} - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed in CUDA' - # cannot initialize vllm replicas on different GPUs - self.num_proc = 1 - if model_params.get('tensor_parallel_size') is None: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to \ - {tensor_parallel_size} for vllm.') - model_params['tensor_parallel_size'] = tensor_parallel_size - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - **model_params) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - return_pipe=True, - **model_params) - self.sampling_params = sampling_params
- -
[docs] def parse_output(self, raw_output): - logger.debug(raw_output) - qa_list = [] - matches = re.findall(self.output_pattern, raw_output, re.DOTALL) - for match in matches: - user, assistant = match - qa_list.append((user.strip(), assistant.strip())) - return qa_list
- -
[docs] def process_batched(self, samples, rank=None): - model, _ = get_model(self.model_key, rank, self.use_cuda()) - - input_keys = samples.keys() - num_samples = len(samples[next(iter(input_keys))]) - output_keys = input_keys | {self.query_key, self.response_key} - output_samples = {key: [] for key in output_keys} - - for i in range(num_samples): - messages = [{'role': 'user', 'content': samples[self.text_key][i]}] - - if self.enable_vllm: - response = model.chat(messages, self.sampling_params) - output = response[0].outputs[0].text - else: - # model is pipe - response = model(messages, - return_full_text=False, - **self.sampling_params) - output = response[0]['generated_text'] - - qa_list = self.parse_output(output) - if len(qa_list) > 0: - for q, a in qa_list: - for input_k in input_keys: - output_samples[input_k].append(samples[input_k][i]) - output_samples[self.query_key].append(q) - output_samples[self.response_key].append(a) - else: - logger.warning( - 'No question and answer was extracted from current sample!' - ) - - return output_samples
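Likewise, a sketch of the default 'Human:.../Assistant:...' parsing on a made-up response:

    import re

    raw = 'Human: 蒙古国的首都是哪里?\nAssistant: 乌兰巴托。\n'
    pattern = r'Human:(.*?)Assistant:(.*?)(?=Human|$)'
    qa = [(u.strip(), a.strip())
          for u, a in re.findall(pattern, raw, re.DOTALL)]
    # qa == [('蒙古国的首都是哪里?', '乌兰巴托。')]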
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/image_blur_mapper.html b/_modules/data_juicer/ops/mapper/image_blur_mapper.html
deleted file mode 100644
index 958880d62..000000000
--- a/_modules/data_juicer/ops/mapper/image_blur_mapper.html
+++ /dev/null
@@ -1,205 +0,0 @@
-data_juicer.ops.mapper.image_blur_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.image_blur_mapper

-import os
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-OP_NAME = 'image_blur_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_IMAGES.register_module(OP_NAME) -class ImageBlurMapper(Mapper): - """Mapper to blur images. - """ - -
[docs] def __init__(self, - p: float = 0.2, - blur_type: str = 'gaussian', - radius: float = 2, - *args, - **kwargs): - """ - Initialization method. - - :param p: Probability of the image being blurred. - :param blur_type: Type of blur kernel, including - ['mean', 'box', 'gaussian']. - :param radius: Radius of blur kernel. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - if blur_type not in ['mean', 'box', 'gaussian']: - raise ValueError( - f'Blur_type [{blur_type}] is not supported. ' - f'Can only be one of ["mean", "box", "gaussian"]. ') - if radius < 0: - raise ValueError('Radius must be >= 0. ') - - self.p = p - - from PIL import ImageFilter - if blur_type == 'mean': - self.blur = ImageFilter.BLUR - elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) - else: - self.blur = ImageFilter.GaussianBlur(radius)
- -
[docs] def process_single(self, sample, context=False): - # there is no image in this sample - if self.image_key not in sample or not sample[self.image_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.image_key] - - # load images - loaded_image_keys = sample[self.image_key] - sample, images = load_data_with_context(sample, context, - loaded_image_keys, load_image) - processed = {} - for image_key in loaded_image_keys: - if image_key in processed: - continue - - if self.p < np.random.rand(): - processed[image_key] = image_key - else: - blured_image_key = transfer_filename(image_key, OP_NAME, - **self._init_parameters) - if not os.path.exists( - blured_image_key) or blured_image_key not in images: - blured_image = images[image_key].convert('RGB').filter( - self.blur) - images[blured_image_key] = blured_image - blured_image.save(blured_image_key) - if context: - sample[Fields.context][blured_image_key] = blured_image - processed[image_key] = blured_image_key - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(loaded_image_keys): - if sample[Fields.source_file][i] != value: - if processed[value] != value: - sample[Fields.source_file][i] = value - - sample[self.image_key] = [processed[key] for key in loaded_image_keys] - return sample
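The three kernels above map directly onto PIL filters; a standalone sketch with a hypothetical image path:

    from PIL import Image, ImageFilter

    img = Image.open('example.jpg').convert('RGB')
    img.filter(ImageFilter.BLUR)             # blur_type='mean'
    img.filter(ImageFilter.BoxBlur(2))       # blur_type='box', radius=2
    img.filter(ImageFilter.GaussianBlur(2))  # blur_type='gaussian', radius=2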
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html b/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html
deleted file mode 100644
index 7c6feedfe..000000000
--- a/_modules/data_juicer/ops/mapper/image_captioning_from_gpt4v_mapper.html
+++ /dev/null
@@ -1,381 +0,0 @@
-data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.image_captioning_from_gpt4v_mapper

-import copy
-from typing import Optional
-
-import requests
-from loguru import logger
-from pydantic import Field
-from typing_extensions import Annotated
-
-from data_juicer.utils.mm_utils import (SpecialTokens, image_byte_to_base64,
-                                        insert_texts_after_placeholders,
-                                        load_image_byte,
-                                        remove_non_special_tokens,
-                                        remove_special_tokens)
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-SYSTEM_PROMPTS = {
-    'resoning':
-    "You are an AI visual assistant that can analyze a single image. The task is to use the provided image, create a plausible question about the image, and provide the answer in detail.\n\nYou can create complex questions beyond describing the scene. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first.\n\nTo answer such questions, you should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. \n\nPlease give the Q&A content directly and separate questions and answers with Q and A.",  # noqa: E501
-    'description':
-    'You are an AI visual assistant that can analyze a single image. The task is to use the provided image, create a reasonable question that describes the content of the image, and provide the answer in detail.\n\nPlease give the Q&A content directly and separate questions and answers with Q and A.',  # noqa: E501
-    'conversation':
-    'You are an AI visual assistant, and you are seeing a single image.\n\nDesign a conversation between you and a person asking about this image. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question. Ask diverse questions and give corresponding answers.\n\nInclude questions asking about the visual content of the image, including the object types, counting the objects, object actions, object locations, relative positions between objects, etc. Only include questions that have definite answers:\n(1) one can see the content in the image that the question asks about and can answer confidently;\n(2) one can determine confidently from the image that it is not in the image.\nDo not ask any question that cannot be answered confidently.\n\nConversation also include complex questions that are relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details.\nProvide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. Please give the content of the conversation directly and separate questions and answers with Q and A'  # noqa: E501
-}
-
-
-def call_gpt_vision_api(api_key,
-                        system_prompt,
-                        user_prompt,
-                        base64_image,
-                        max_tokens=500,
-                        temperature=1.0,
-                        model='gpt-4-vision-preview'):
-    api_url = 'https://api.openai.com/v1/chat/completions'
-    headers = {
-        'Content-Type': 'application/json',
-        'Authorization': f'Bearer {api_key}'
-    }
-    data = {
-        'model':
-        model,
-        'messages': [{
-            'role': 'system',
-            'content': system_prompt
-        }, {
-            'role':
-            'user',
-            'content': [{
-                'type': 'text',
-                'text': user_prompt
-            }, {
-                'type': 'image_url',
-                'image_url': {
-                    'url': f'data:image/jpeg;base64,{base64_image}',
-                    'detail': 'low'
-                }
-            }]
-        }],
-        'max_tokens':
-        max_tokens,
-        'temperature':
-        temperature
-    }
-    try:
-        response = requests.post(api_url, headers=headers, json=data)
-        response.raise_for_status()
-        result = response.json()
-
-        if 'choices' in result and result['choices']:
-            return result['choices'][0]['message']['content']
-        else:
-            logger.warning('No results returned from the API, return None.')
-            return None
-
-    except requests.exceptions.HTTPError as errh:
-        if errh.response.status_code == 401:
-            logger.warning('Invalid API key provided.')
-        elif errh.response.status_code == 429:
-            logger.warning(
-                'API request limit has been reached. Please try again later.')
-        else:
-            logger.warning(f'HTTP error occurred: {errh}')
-    except requests.exceptions.ConnectionError:
-        logger.warning('Network error occurred. Please check your connection.')
-    except requests.exceptions.Timeout:
-        logger.warning('The request timed out. Please try again later.')
-    except requests.exceptions.RequestException as err:
-        logger.warning(f'An error occurred: {err}')
-    except Exception as e:
-        logger.warning(f'An unexpected error occurred: {e}')
-
-    logger.warning('API request failed, return None.')
-    return None
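A hypothetical call of this helper ('sk-...' and 'example.jpg' are placeholders, not real values):

    caption = call_gpt_vision_api(
        api_key='sk-...',
        system_prompt=SYSTEM_PROMPTS['description'],
        user_prompt='',
        base64_image=image_byte_to_base64(load_image_byte('example.jpg')),
    )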
-
-
-
[docs]@OPERATORS.register_module('image_captioning_from_gpt4v_mapper') -@LOADED_IMAGES.register_module('image_captioning_from_gpt4v_mapper') -class ImageCaptioningFromGPT4VMapper(Mapper): - """Mapper to generate samples whose texts are generated based on - gpt-4-vision and the image.""" - - _batched_op = True -
[docs] def __init__(self, - mode: str = 'description', - api_key: str = '', - max_token: int = 500, - temperature: Annotated[float, Field(ge=0, le=1)] = 1.0, - system_prompt: str = '', - user_prompt: str = '', - user_prompt_key: Optional[str] = None, - keep_original_sample: bool = True, - any_or_all: str = 'any', - *args, - **kwargs): - """ - Initialization method. - - :param mode: mode of text generated from images, can be one of - ['resoning', 'description', 'conversation', 'custom'] - :param api_key: the API key to authenticate the request. - :param max_token: the maximum number of tokens to generate. - Default is 500. - :param temperature: controls the randomness of the output (range - from 0 to 1). Default is 1.0. - :param system_prompt: a string prompt used to set the context of a - conversation and provide global guidance or rules for the - gpt4-vision so that it can generate responses in the expected way. - If `mode` set to `custom`, the parameter will be used. - :param user_prompt: a string prompt to guide the generation of - gpt4-vision for each sample. It's "" in default, which means no - prompt provided. - :param user_prompt_key: the key name of fields in samples to store - prompts for each sample. It's used to set different prompts for - different samples. If it's none, use prompt in parameter - "user_prompt". It's None in default. - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated text in the - final datasets and the original text will be removed. It's True - in default. - :param any_or_all: keep this sample with 'any' or 'all' strategy of - all images. 'any': keep this sample if any images meet the - condition. 'all': keep this sample only if all images meet the - condition. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - if mode not in ['resoning', 'description', 'conversation', 'custom']: - raise ValueError( - f'Mode [{mode}] is not supported. ' - f'Can only be one of ' - f'["resoning", "description", "conversation", "custom"].') - - if mode == 'custom': - self.system_prompt = system_prompt - logger.info('The parameter `mode` set to `[custom]`. Data-Juicer ' - 'will use `system_prompt` to generate text.') - else: - self.system_prompt = SYSTEM_PROMPTS[mode] - logger.info( - f'The parameter `mode` set to [{mode}]. Data-Juicer will ' - f'use default prompt to generate text.') - - self.mode = mode - self.api_key = api_key - self.max_token = max_token - self.temperature = temperature - self.user_prompt = user_prompt - self.user_prompt_key = user_prompt_key - self.keep_original_sample = keep_original_sample - self.any_or_all = any_or_all - self.extra_args = kwargs - - # report a warning when both user_prompt and user_prompt_key are set - if self.user_prompt and self.user_prompt_key: - logger.warning( - 'Both the parameter `user_prompt` and `user_prompt_key` are ' - 'set. Data-Juicer will consider `user_prompt_key` first.')
- - def _process_single_sample(self, sample): - # there is no image in this sample - if self.image_key not in sample or not sample[self.image_key]: - return [] - - # the generated results - generated_sample = copy.deepcopy(sample) - generated_sample[self.text_key] = '' - - # load all image(s) - loaded_image_keys = sample[self.image_key] - images = {} - for loaded_image_key in loaded_image_keys: - if loaded_image_key not in images: - # avoid loading the same images - image = load_image_byte(loaded_image_key) - images[loaded_image_key] = image - - # construct user prompts - if self.user_prompt_key and isinstance(sample[self.user_prompt_key], - str): - # check user_prompt_key is not None, and it's a str in the sample - prompt_texts = sample[self.user_prompt_key] - elif self.user_prompt and isinstance(self.user_prompt, str): - # check prompt is not None, and it's a str - prompt_texts = self.user_prompt - else: - prompt_texts = '' - - offset = 0 - # do generation for each image chunk by chunk - for chunk in sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks or contents after the last eoc token - if not chunk.strip(): - continue - - else: - img_count = chunk.count(SpecialTokens.image) - text_with_only_special_tokens = remove_non_special_tokens( - chunk) - generated_text_single_chunk = [] - for image_key in loaded_image_keys[offset:offset + img_count]: - image = images[image_key] - res = call_gpt_vision_api(self.api_key, self.system_prompt, - prompt_texts, - image_byte_to_base64(image), - self.max_token, self.temperature) - generated_text_single_chunk.append(res) - if self.any_or_all == 'all' and not all( - generated_text_single_chunk): - return [] - - # insert the generated text according to given mode - place_holders = [SpecialTokens.image] * img_count - new_generated_text_per_chunk = insert_texts_after_placeholders( - original_string=text_with_only_special_tokens, - placeholders=place_holders, - new_texts=generated_text_single_chunk) - generated_sample[ - self. - text_key] += f'{new_generated_text_per_chunk}{SpecialTokens.eoc}' # noqa: E501 - offset += img_count - if self.any_or_all == 'any' and not remove_special_tokens( - generated_sample[self.text_key]): - return [] - - return [generated_sample] - -
[docs] def process_batched(self, samples): - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - samples_after_generation = [] - # do generation for each sample within the batch - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_generation.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample) - if len(generated_samples) != 0: - samples_after_generation.extend(generated_samples) - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_generation[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_generation] - - return res_samples
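A sketch of the "dict of lists" <-> "list of dicts" reshaping used in process_batched above, with made-up keys:

    batch = {'text': ['a', 'b'], 'images': [['1.jpg'], ['2.jpg']]}
    rows = [{k: batch[k][i] for k in batch}
            for i in range(len(batch['text']))]
    back = {k: [r[k] for r in rows] for k in rows[0]}
    assert back == batch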
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/image_captioning_mapper.html b/_modules/data_juicer/ops/mapper/image_captioning_mapper.html
deleted file mode 100644
index d01d2f899..000000000
--- a/_modules/data_juicer/ops/mapper/image_captioning_mapper.html
+++ /dev/null
@@ -1,415 +0,0 @@
-data_juicer.ops.mapper.image_captioning_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.image_captioning_mapper

-import copy
-import random
-from typing import Optional
-
-import numpy as np
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (SpecialTokens,
-                                        insert_texts_after_placeholders,
-                                        load_image, remove_non_special_tokens,
-                                        remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-simhash = LazyLoader('simhash', 'simhash')
-
-OP_NAME = 'image_captioning_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_IMAGES.register_module(OP_NAME) -class ImageCaptioningMapper(Mapper): - """Mapper to generate samples whose captions are generated based on - another model and the image.""" - - _accelerator = 'cuda' - _batched_op = True -
[docs] def __init__(self, - hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', - trust_remote_code: bool = False, - caption_num: PositiveInt = 1, - keep_candidate_mode: str = 'random_any', - keep_original_sample: bool = True, - prompt: Optional[str] = None, - prompt_key: Optional[str] = None, - *args, - **kwargs): - """ - Initialization method. - - :param hf_img2seq: model name on huggingface to generate caption - :param caption_num: how many candidate captions to generate - for each image - :param keep_candidate_mode: retain strategy for the generated - $caption_num$ candidates. - - 'random_any': Retain the random one from generated captions - - 'similar_one_simhash': Retain the generated one that is most - similar to the original caption - - 'all': Retain all generated captions by concatenation - - Note: - This is a batched_OP, whose input and output type are - both list. Suppose there are $N$ lists of input samples, whose - batch size is $b$, and denote caption_num as $M$. - For 'random_any' and 'similar_one_simhash' modes, the number of - total samples after generation is $2Nb$ when keep_original_sample - is True and $Nb$ when it is False. For 'all' mode, it's $(1+M)Nb$ - when keep_original_sample is True and $MNb$ when it is False. - - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated captions in the - final datasets and the original captions will be removed. It's True - in default. - :param prompt: a string prompt to guide the generation of blip2 model - for all samples globally. It's None in default, which means no - prompt provided. - :param prompt_key: the key name of fields in samples to store prompts - for each sample. It's used to set different prompts for different - samples. If it's none, use prompt in parameter "prompt". It's None - in default. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - if keep_candidate_mode not in [ - 'random_any', 'similar_one_simhash', 'all' - ]: - raise ValueError( - f'Keep strategy [{keep_candidate_mode}] is not supported. ' - f'Can only be one of ' - f'["random_any", "similar_one_simhash", "all"].') - - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_img2seq, - trust_remote_code=trust_remote_code) - self.caption_num = caption_num - self.keep_candidate_mode = keep_candidate_mode - self.keep_original_sample = keep_original_sample - self.prompt = prompt - self.prompt_key = prompt_key - self.extra_args = kwargs - if keep_candidate_mode in ['random_any', 'similar_one_simhash']: - self.num_newly_generated_samples = 1 - elif keep_candidate_mode in ['all']: - self.num_newly_generated_samples = self.caption_num - else: - self.num_newly_generated_samples = 0 - - # report a warning when both prompt and prompt_key are set - if self.prompt and self.prompt_key: - logger.warning( - 'Both the parameter `prompt` and `prompt_key` are ' - 'set. Data-Juicer will consider `prompt_key` first.')
- - def _process_single_sample(self, ori_sample, rank=None): - """ - - :param ori_sample: a single data sample before applying generation - :return: batched results after generation - """ - # there is no image in this sample - if self.image_key not in ori_sample or \ - not ori_sample[self.image_key]: - return [] - - # the generated results - generated_samples = [ - copy.deepcopy(ori_sample) - for _ in range(self.num_newly_generated_samples) - ] - for generated_sample in generated_samples: - generated_sample[self.text_key] = '' - - # 1. load all image(s) - loaded_image_keys = ori_sample[self.image_key] - images = {} - for loaded_image_key in loaded_image_keys: - if loaded_image_key not in images: - # avoid loading the same images - image = load_image(loaded_image_key) - images[loaded_image_key] = image - - offset = 0 - - # we follow such assumption: - # all text/img/video/audio data within a chunk are correlated. - # As a result, - # the original text will be removed, - # the generated text will be placed following each SpecialTokens.img - # and the original special tokens are kept in an order-preserving way. - - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - # do generation for each image chunk by chunk - for chunk in ori_sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks or contents after the last eoc token - if not chunk.strip(): - continue - - img_count = chunk.count(SpecialTokens.image) - text_with_only_special_tokens = remove_non_special_tokens(chunk) - image_chunk = [] - for image_key in loaded_image_keys[offset:offset + img_count]: - image = images[image_key] - image_chunk.append(image) - - # 2. generate candidate caption(s) in batch manner - generated_text_candidates_single_chunk = \ - [[] for _ in range(self.caption_num)] - # an assistant 2-D array, - # generated_text_candidates_single_chunk[i][j] indicates - # the $i$-th generated candidate for the $j$-th image - - # construct prompts - if self.prompt_key \ - and isinstance(ori_sample[self.prompt_key], str): - # check prompt_key is not None, and it's a str in the sample - prompt_texts = [ori_sample[self.prompt_key]] * len(image_chunk) - elif self.prompt and isinstance(self.prompt, str): - # check prompt is not None, and it's a str - prompt_texts = [self.prompt] * len(image_chunk) - else: - prompt_texts = None - - inputs = processor(images=image_chunk, - text=prompt_texts, - return_tensors='pt').to(model.device) - for i in range(self.caption_num): - generated_ids = model.generate(**inputs, - max_new_tokens=128, - do_sample=True) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True) - generated_text_candidates_single_chunk[i] = generated_text - - # 3. 
insert a list of generated captions into the positions of - # subsequent placeholders in the original string - new_generated_text_all_images = \ - [[] for _ in range(self.num_newly_generated_samples)] - # new_generated_text_all_images is a helper array, element [i][j] - # denotes the reduced $i$-th result for the $j$-th image - - # reduce the captions according to given mode image by image - for j in range(img_count): - new_generated_text_per_image = self._reduce_captions_per_image( - chunk, [ - captions[j] - for captions in generated_text_candidates_single_chunk - ]) - assert self.num_newly_generated_samples == \ - len(new_generated_text_per_image) - for i in range(len(new_generated_text_per_image)): - new_generated_text_all_images[i].append( - new_generated_text_per_image[i]) - - # insert the captions according to given mode - place_holders = [SpecialTokens.image] * img_count - for i in range(self.num_newly_generated_samples): - new_generated_text_per_chunk = insert_texts_after_placeholders( - original_string=text_with_only_special_tokens, - placeholders=place_holders, - new_texts=new_generated_text_all_images[i]) - generated_samples[i][self.text_key] += \ - f'{new_generated_text_per_chunk}{SpecialTokens.eoc}' - - offset += img_count - - return generated_samples - - def _reduce_captions_per_image(self, chunk, - generated_text_candidates_single_chunk): - new_generated_text_per_chunk = [] - if self.keep_candidate_mode == 'random_any': - new_generated_text_per_chunk.append( - random.choice(generated_text_candidates_single_chunk)) - elif self.keep_candidate_mode == 'all': - new_generated_text_per_chunk.extend( - generated_text_candidates_single_chunk) - elif self.keep_candidate_mode == 'similar_one_simhash': - - from ..deduplicator.document_simhash_deduplicator import \ - DocumentSimhashDeduplicator - ori_normal_text = remove_special_tokens(chunk) - # using a simhash OP to calculate their similarity - # NOTE: simhash is just one method to calculate the similarities - # between texts, but not the most accurate one. More methods (e.g. - # embedding-based, ...) will be added. - op_simhash = DocumentSimhashDeduplicator(window_size=2, - **self.extra_args) - ori_text_hash = np.uint64( - op_simhash.compute_hash({op_simhash.text_key: - ori_normal_text})[HashKeys.simhash]) - generated_text_hashes = [ - np.uint64( - op_simhash.compute_hash( - {op_simhash.text_key: - candidate_text})[HashKeys.simhash]) - for candidate_text in generated_text_candidates_single_chunk - ] - hamming_distances = [ - simhash.num_differing_bits(ori_text_hash, generated_text_hash) - for generated_text_hash in generated_text_hashes - ] - max_index = min(range(len(hamming_distances)), - key=hamming_distances.__getitem__) - new_generated_text_per_chunk.append( - generated_text_candidates_single_chunk[max_index]) - return new_generated_text_per_chunk - -
[docs] def process_batched(self, samples, rank=None): - """ - Note: - This is a batched_OP, whose input and output type are - both list. Suppose there are $N$ input sample list with batch - size as $b$, and denote caption_num as $M$. - the number of total samples after generation is $2Nb$ - for 'random_any' and 'similar_one' mode, - and $(1+M)Nb$ for 'all' mode. - - :param samples: - :return: - """ - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - samples_after_generation = [] - # do generation for each sample within the batch - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_generation.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample, - rank=rank) - if len(generated_samples) != 0: - samples_after_generation.extend(generated_samples) - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_generation[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_generation] - - return res_samples
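A worked instance of the sample counts stated in the docstrings, assuming N=1 batch of b=2 samples, caption_num M=3, and keep_original_sample=True:

    N, b, M = 1, 2, 3
    print((1 + 1) * N * b)  # 'random_any'/'similar_one_simhash': 4 samples
    print((1 + M) * N * b)  # 'all': 8 samples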
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html b/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html
deleted file mode 100644
index db0eda7f8..000000000
--- a/_modules/data_juicer/ops/mapper/image_diffusion_mapper.html
+++ /dev/null
@@ -1,351 +0,0 @@
-data_juicer.ops.mapper.image_diffusion_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.image_diffusion_mapper

-import copy
-import os
-from typing import Optional
-
-from PIL import Image
-from pydantic import Field, PositiveInt
-from typing_extensions import Annotated
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.mm_utils import (SpecialTokens, load_data_with_context,
-                                        load_image, remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-OP_NAME = 'image_diffusion_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_IMAGES.register_module(OP_NAME) -class ImageDiffusionMapper(Mapper): - """ - Generate images by diffusion model. - """ - - _accelerator = 'cuda' - _batched_op = True -
[docs] def __init__(self, - hf_diffusion: str = 'CompVis/stable-diffusion-v1-4', - trust_remote_code: bool = False, - torch_dtype: str = 'fp32', - revision: str = 'main', - strength: Annotated[float, Field(ge=0, le=1)] = 0.8, - guidance_scale: float = 7.5, - aug_num: PositiveInt = 1, - keep_original_sample: bool = True, - caption_key: Optional[str] = None, - hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', - *args, - **kwargs): - """ - Initialization method. - - :param hf_diffusion: diffusion model name on huggingface to generate - the image. - :param torch_dtype: the floating point type used to load the diffusion - model. Can be one of ['fp32', 'fp16', 'bf16'] - :param revision: The specific model version to use. It can be a - branch name, a tag name, a commit id, or any identifier allowed - by Git. - :param strength: Indicates the extent to which the reference image is - transformed. Must be between 0 and 1. The image is used as a - starting point, and more noise is added the higher the strength. - The number of denoising steps depends on the amount of noise - initially added. When strength is 1, the added noise is maximal - and the denoising process runs for the full number of iterations - specified in num_inference_steps. A value of 1 essentially ignores - the reference image. - :param guidance_scale: A higher guidance scale value encourages the - model to generate images closely linked to the text prompt at the - expense of lower image quality. Guidance scale is enabled when - guidance_scale > 1. - :param aug_num: the number of images to produce per source image with - the stable-diffusion model. - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated images in the - final datasets and the original images will be removed. It's True - in default. - - Note: - This is a batched_OP, whose input and output type are - both list. Suppose there are $N$ lists of input samples, whose - batch size is $b$, and denote aug_num as $M$. The number of total - samples after generation is $(1+M)Nb$ when keep_original_sample - is True and $MNb$ when it is False. - - :param caption_key: the key name of fields in samples to store captions - for each image. It can be a string if there is only one image in - each sample. Otherwise, it should be a list. If it's none, - ImageDiffusionMapper will produce captions for each image. - :param hf_img2seq: model name on huggingface to generate caption if - caption_key is None. - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - self.strength = strength - self.guidance_scale = guidance_scale - self.aug_num = aug_num - self.keep_original_sample = keep_original_sample - self.caption_key = caption_key - self.prompt = 'A photo of a ' - if not self.caption_key: - from .image_captioning_mapper import ImageCaptioningMapper - self.op_generate_caption = ImageCaptioningMapper( - hf_img2seq=hf_img2seq, - keep_original_sample=False, - prompt=self.prompt) - self.model_key = prepare_model( - model_type='diffusion', - pretrained_model_name_or_path=hf_diffusion, - diffusion_type='image2image', - torch_dtype=torch_dtype, - revision=revision, - trust_remote_code=trust_remote_code)
- - def _real_guidance(self, caption: str, image: Image.Image, rank=None): - - canvas = image.resize((512, 512), Image.BILINEAR) - prompt = caption - - diffusion_model = get_model(model_key=self.model_key, - rank=rank, - use_cuda=self.use_cuda()) - - kwargs = dict(image=canvas, - prompt=[prompt], - strength=self.strength, - guidance_scale=self.guidance_scale) - - has_nsfw_concept = True - while has_nsfw_concept: - outputs = diffusion_model(**kwargs) - - has_nsfw_concept = (diffusion_model.safety_checker is not None - and outputs.nsfw_content_detected[0]) - - canvas = outputs.images[0].resize(image.size, Image.BILINEAR) - - return canvas - - def _process_single_sample(self, ori_sample, rank=None, context=False): - """ - :param ori_sample: a single data sample before applying generation - :return: batched results after generation - """ - # there is no image in this sample - if self.image_key not in ori_sample or \ - not ori_sample[self.image_key]: - return [] - - # load images - loaded_image_keys = ori_sample[self.image_key] - ori_sample, images = load_data_with_context(ori_sample, context, - loaded_image_keys, - load_image) - - # load captions - if self.caption_key: - captions = ori_sample[self.caption_key] - if not isinstance(captions, list): - # one caption for all images - captions = [captions] * len(images) - else: - assert len(captions) == len( - images - ), 'The num of captions must match the num of images.' - captions = [remove_special_tokens(c) for c in captions] - else: - caption_samples = { - self.text_key: [SpecialTokens.image] * len(images), - self.image_key: [[k] for k in loaded_image_keys] - } - caption_samples = self.op_generate_caption.process(caption_samples, - rank=rank) - captions = caption_samples[self.text_key] - captions = [ - self.prompt + remove_special_tokens(c) for c in captions - ] - - # the generated results - generated_samples = [ - copy.deepcopy(ori_sample) for _ in range(self.aug_num) - ] - - for aug_id in range(self.aug_num): - diffusion_image_keys = [] - for index, value in enumerate(loaded_image_keys): - related_parameters = self.add_parameters( - self._init_parameters, caption=captions[index]) - diffusion_image_key = transfer_filename( - value, OP_NAME, **related_parameters) - diffusion_image_keys.append(diffusion_image_key) - # TODO: duplicated generation if image is reused - if not os.path.exists(diffusion_image_key - ) or diffusion_image_key not in images: - diffusion_image = self._real_guidance(captions[index], - images[value], - rank=rank) - images[diffusion_image_key] = diffusion_image - diffusion_image.save(diffusion_image_key) - if context: - generated_samples[aug_id][Fields.context][ - diffusion_image_key] = diffusion_image - generated_samples[aug_id][self.image_key] = diffusion_image_keys - - return generated_samples - -
[docs] def process_batched(self, samples, rank=None, context=False): - """ - Note: - This is a batched_OP, whose the input and output type are - both list. Suppose there are $N$ input sample list with batch - size as $b$, and denote aug_num as $M$. - the number of total samples after generation is $(1+M)Nb$. - - :param samples: - :return: - """ - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - - # do generation for each sample within the batch - samples_after_generation = [] - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_generation.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample, - rank=rank) - if len(generated_samples) != 0: - samples_after_generation.extend(generated_samples) - - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_generation[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_generation] - - return res_samples
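A sketch of the underlying img2img call the op performs through data-juicer's model registry, expressed directly against the diffusers API (file names are placeholders):

    from PIL import Image
    from diffusers import StableDiffusionImg2ImgPipeline

    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
        'CompVis/stable-diffusion-v1-4')
    init = Image.open('example.jpg').convert('RGB').resize((512, 512))
    out = pipe(prompt=['A photo of a cat'], image=init,
               strength=0.8, guidance_scale=7.5)
    out.images[0].save('example_aug.jpg')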
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html
deleted file mode 100644
index edb14a662..000000000
--- a/_modules/data_juicer/ops/mapper/image_face_blur_mapper.html
+++ /dev/null
@@ -1,245 +0,0 @@
-data_juicer.ops.mapper.image_face_blur_mapper — data_juicer 0.2.0 documentation

Source code for data_juicer.ops.mapper.image_face_blur_mapper

-import os
-
-from loguru import logger
-from PIL import ImageFilter
-from pydantic import NonNegativeFloat
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (detect_faces, load_data_with_context,
-                                        load_image)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'image_face_blur_mapper'
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -@LOADED_IMAGES.register_module(OP_NAME) -class ImageFaceBlurMapper(Mapper): - """Mapper to blur faces detected in images. - """ - - _default_kwargs = { - 'scaleFactor': 1.1, - 'minNeighbors': 3, - 'minSize': None, - 'maxSize': None, - } - -
[docs] def __init__(self, - cv_classifier: str = '', - blur_type: str = 'gaussian', - radius: NonNegativeFloat = 2, - *args, - **kwargs): - """ - Initialization method. - - :param cv_classifier: OpenCV classifier path for face detection. - By default, we will use 'haarcascade_frontalface_alt.xml'. - :param blur_type: Type of blur kernel, including - ['mean', 'box', 'gaussian']. - :param radius: Radius of blur kernel. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - - if cv_classifier == '': - cv_classifier = os.path.join(cv2.data.haarcascades, - 'haarcascade_frontalface_alt.xml') - if blur_type not in ['mean', 'box', 'gaussian']: - raise ValueError( - f'Blur_type [{blur_type}] is not supported. ' - f'Can only be one of ["mean", "box", "gaussian"]. ') - if radius < 0: - raise ValueError('Radius must be >= 0. ') - - if blur_type == 'mean': - self.blur = ImageFilter.BLUR - elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) - else: - self.blur = ImageFilter.GaussianBlur(radius) - - self.blur_type = blur_type - self.radius = radius - - self.extra_kwargs = self._default_kwargs - for key in kwargs: - if key in self.extra_kwargs: - self.extra_kwargs[key] = kwargs[key] - - self.model_key = prepare_model(model_type='opencv_classifier', - model_path=cv_classifier)
- -
[docs] def process_single(self, sample, context=False): - # there is no image in this sample - if self.image_key not in sample or not sample[self.image_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.image_key] - - # load images - loaded_image_keys = sample[self.image_key] - sample, images = load_data_with_context(sample, context, - loaded_image_keys, load_image) - - model = get_model(self.model_key) - - # detect faces - face_detections = {} - for key, image in images.items(): - face_detections[key] = detect_faces(image, model, - **self.extra_kwargs) - logger.debug(f'detections: {face_detections}') - - # blur face regions - key_mapping = {} - for key, image in images.items(): - dets = face_detections[key] - # only blur when detected face - if len(dets) > 0: - blured_image = image.copy() - for (x, y, w, h) in dets: - box = (x, y, x + w, y + h) - blured_roi = image.crop(box).filter(self.blur) - blured_image.paste(blured_roi, box) - blured_image_key = transfer_filename(key, OP_NAME, - **self._init_parameters) - blured_image.save(blured_image_key) - key_mapping[key] = blured_image_key - if context: - sample[Fields.context][blured_image_key] = blured_image - else: - key_mapping[key] = key - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(loaded_image_keys): - if sample[Fields.source_file][i] != value: - if key_mapping[value] != value: - sample[Fields.source_file][i] = value - - sample[self.image_key] = [ - key_mapping[key] for key in loaded_image_keys - ] - return sample
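The per-face crop -> filter -> paste step in isolation (the file name and box coordinates are hypothetical):

    from PIL import Image, ImageFilter

    img = Image.open('face.jpg')
    x, y, w, h = 40, 30, 80, 80  # one detection box from the cascade
    box = (x, y, x + w, y + h)
    img.paste(img.crop(box).filter(ImageFilter.GaussianBlur(2)), box)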
diff --git a/_modules/data_juicer/ops/mapper/image_tagging_mapper.html b/_modules/data_juicer/ops/mapper/image_tagging_mapper.html
deleted file mode 100644
index f8f2e5e99..000000000
--- a/_modules/data_juicer/ops/mapper/image_tagging_mapper.html
+++ /dev/null
@@ -1,187 +0,0 @@
Source code for data_juicer.ops.mapper.image_tagging_mapper

-from collections import Counter
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import load_data_with_context, load_image
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Mapper
-from ..op_fusion import LOADED_IMAGES
-
-torch = LazyLoader('torch', 'torch')
-ram = LazyLoader('ram', 'ram')
-
-OP_NAME = 'image_tagging_mapper'
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -@LOADED_IMAGES.register_module(OP_NAME) -class ImageTaggingMapper(Mapper): - """Mapper to generate image tags. - """ - - _accelerator = 'cuda' - -
[docs] def __init__(self, - tag_field_name: str = Fields.image_tags, - *args, - **kwargs): - """ - Initialization method. - - :param tag_field_name: the field name to store the tags. It's - "__dj__image_tags__" by default. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.model_key = prepare_model( - model_type='recognizeAnything', - pretrained_model_name_or_path='ram_plus_swin_large_14m.pth', - input_size=384) - self.transform = ram.get_transform(image_size=384) - self.tag_field_name = tag_field_name
- -
[docs] def process_single(self, sample, rank=None, context=False): - # check if it's generated already - if self.tag_field_name in sample: - return sample - - # there is no image in this sample - if self.image_key not in sample or not sample[self.image_key]: - sample[self.tag_field_name] = np.array([[]], dtype=np.str_) - return sample - - # load images - loaded_image_keys = sample[self.image_key] - sample, images = load_data_with_context(sample, context, - loaded_image_keys, load_image) - - model = get_model(self.model_key, rank, self.use_cuda()) - image_tags = [] - for _, value in enumerate(loaded_image_keys): - image = images[value] - - image_tensor = torch.unsqueeze(self.transform(image), dim=0).to( - next(model.parameters()).device) - with torch.no_grad(): - tags, _ = model.generate_tag(image_tensor) - - words = [word.strip() for word in tags[0].split('|')] - word_count = Counter(words) - sorted_word_list = [item for item, _ in word_count.most_common()] - image_tags.append(np.array(sorted_word_list, dtype=np.str_)) - - sample[self.tag_field_name] = image_tags - return sample
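A sketch of invoking the tagger directly, assuming the 'ram_plus_swin_large_14m.pth' checkpoint and a CUDA device are available and the default 'images' key is used; the tag field name below is the documented default:

    from data_juicer.ops.mapper.image_tagging_mapper import ImageTaggingMapper

    op = ImageTaggingMapper()
    sample = {'images': ['/path/to/photo.jpg']}  # hypothetical input path
    res = op.process_single(sample, rank=0)
    print(res['__dj__image_tags__'])  # one array of sorted tag strings per image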
diff --git a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html b/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
deleted file mode 100644
index b9bd6df00..000000000
--- a/_modules/data_juicer/ops/mapper/nlpaug_en_mapper.html
+++ /dev/null
@@ -1,266 +0,0 @@
Source code for data_juicer.ops.mapper.nlpaug_en_mapper

-from copy import deepcopy
-
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.lazy_loader import LazyLoader
-
-from ..base_op import OPERATORS, Mapper
-
-nlpaug = LazyLoader('nlpaug', 'nlpaug')
-nac = LazyLoader('nac', 'nlpaug.augmenter.char')
-naw = LazyLoader('naw', 'nlpaug.augmenter.word')
-naf = LazyLoader('naf', 'nlpaug.flow')
-
-OP_NAME = 'nlpaug_en_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class NlpaugEnMapper(Mapper): - """Mapper to apply simple augmentation to samples in English based on the nlpaug library.""" - - _batched_op = True - -
[docs] def __init__(self, - sequential: bool = False, - aug_num: PositiveInt = 1, - keep_original_sample: bool = True, - delete_random_word: bool = False, - swap_random_word: bool = False, - spelling_error_word: bool = False, - split_random_word: bool = False, - keyboard_error_char: bool = False, - ocr_error_char: bool = False, - delete_random_char: bool = False, - swap_random_char: bool = False, - insert_random_char: bool = False, - *args, - **kwargs): - """ - Initialization method. All augmentation methods use their default - parameters. We recommend using only 1-3 augmentation methods at a - time; otherwise, the semantics of the samples might change - significantly. - - :param sequential: whether to combine all augmentation methods into a - sequence. If it's True, a sample will be augmented by all enabled - augmentation methods sequentially. If it's False, each enabled - augmentation method generates its augmented samples - independently. - :param aug_num: number of augmented samples to be generated. If - `sequential` is True, a total of aug_num augmented samples will be - generated. If it's False, (aug_num * - #enabled_aug_methods) augmented samples will be generated. - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated texts in the final - datasets and the original texts will be removed. It's True by - default. - :param delete_random_word: whether to enable the augmentation method of - deleting random words from the original texts. e.g. "I love LLM" - --> "I LLM" - :param swap_random_word: whether to enable the augmentation method of - swapping random contiguous words in the original texts. e.g. "I - love LLM" --> "Love I LLM" - :param spelling_error_word: whether to enable the augmentation method of - simulating spelling errors for words in the original texts. e.g. - "I love LLM" --> "Ai love LLM" - :param split_random_word: whether to enable the augmentation method of - splitting words randomly with whitespaces in the original texts. - e.g. "I love LLM" --> "I love LL M" - :param keyboard_error_char: whether to enable the augmentation method of - simulating keyboard errors for characters in the original texts. - e.g. "I love LLM" --> "I ;ov4 LLM" - :param ocr_error_char: whether to enable the augmentation method of - simulating OCR errors for characters in the original texts. - e.g. "I love LLM" --> "I 10ve LLM" - :param delete_random_char: whether to enable the augmentation method of - deleting random characters from the original texts. e.g. "I love - LLM" --> "I oe LLM" - :param swap_random_char: whether to enable the augmentation method of - swapping random contiguous characters in the original texts. - e.g. "I love LLM" --> "I ovle LLM" - :param insert_random_char: whether to enable the augmentation method of - inserting random characters into the original texts. e.g. "I love - LLM" --> "I ^lKove LLM" - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - self.aug_num = aug_num - if aug_num >= 10: - logger.warning(f'Relatively large augmentation number [{aug_num}]' - f' might generate a large number of new samples and ' - f'require more memory and disk space.') - self.sequential = sequential - self.keep_original_sample = keep_original_sample - - aug_pipeline = [] - # word level - Action = nlpaug.util.Action - if delete_random_word: - aug_pipeline.append(naw.RandomWordAug(action=Action.DELETE)) - if swap_random_word: - aug_pipeline.append(naw.RandomWordAug(action=Action.SWAP)) - if spelling_error_word: - aug_pipeline.append(naw.SpellingAug()) - if split_random_word: - aug_pipeline.append(naw.SplitAug()) - - # char level - if keyboard_error_char: - aug_pipeline.append(nac.KeyboardAug()) - if ocr_error_char: - aug_pipeline.append(nac.OcrAug()) - if delete_random_char: - aug_pipeline.append(nac.RandomCharAug(action=Action.DELETE)) - if swap_random_char: - aug_pipeline.append(nac.RandomCharAug(action=Action.SWAP)) - if insert_random_char: - aug_pipeline.append(nac.RandomCharAug(action=Action.INSERT)) - - if self.sequential: - self.aug = naf.Sequential(aug_pipeline) - else: - self.aug = aug_pipeline
- -
[docs] def process_batched(self, samples): - # no augmentation methods are opened - if len(self.aug) == 0: - if self.keep_original_sample: - return samples - else: - return {key: [] for key in samples} - - texts_to_aug = samples[self.text_key][0] # batch_size = 1 - res_samples = deepcopy(samples) - - # get augmented texts - if self.sequential: - aug_texts = self.aug.augment(texts_to_aug, n=self.aug_num) - else: - # apply each aug method to generate several augmented texts - aug_texts = [] - for aug_method in self.aug: - aug_texts += aug_method.augment(texts_to_aug, n=self.aug_num) - - # add augmented samples to the batch with other replicate fields - if self.keep_original_sample: - res_samples[self.text_key] += aug_texts - else: - res_samples[self.text_key] = aug_texts - # add other replicate fields - for key in res_samples: - if key != self.text_key: - res_samples[key] = res_samples[key] * \ - len(res_samples[self.text_key]) - return res_samples
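A runnable sketch, assuming nlpaug is installed and the default 'text' key is used:

    from data_juicer.ops.mapper.nlpaug_en_mapper import NlpaugEnMapper

    op = NlpaugEnMapper(aug_num=2, delete_random_word=True, spelling_error_word=True)
    samples = {'text': ['I love LLM']}
    out = op.process_batched(samples)
    print(out['text'])  # the original text plus 2 variants per enabled method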
diff --git a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html b/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
deleted file mode 100644
index 0adb96df3..000000000
--- a/_modules/data_juicer/ops/mapper/nlpcda_zh_mapper.html
+++ /dev/null
@@ -1,280 +0,0 @@
Source code for data_juicer.ops.mapper.nlpcda_zh_mapper

-from copy import deepcopy
-
-from loguru import logger
-from pydantic import PositiveInt
-
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-
-from ..base_op import OPERATORS, Mapper
-
-nlpcda = LazyLoader('nlpcda', 'nlpcda')
-
-OP_NAME = 'nlpcda_zh_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class NlpcdaZhMapper(Mapper): - """Mapper to apply simple augmentation to samples in Chinese based on the nlpcda library.""" - - _batched_op = True - -
[docs] def __init__(self, - sequential: bool = False, - aug_num: PositiveInt = 1, - keep_original_sample: bool = True, - replace_similar_word: bool = False, - replace_homophone_char: bool = False, - delete_random_char: bool = False, - swap_random_char: bool = False, - replace_equivalent_num: bool = False, - *args, - **kwargs): - """ - Initialization method. All augmentation methods use their default - parameters. We recommend using only 1-3 augmentation methods at a - time; otherwise, the semantics of the samples might change - significantly. **Notice**: some augmentation methods might not work for - some special texts, so there might be no augmented texts generated. - - :param sequential: whether to combine all augmentation methods into a - sequence. If it's True, a sample will be augmented by all enabled - augmentation methods sequentially. If it's False, each enabled - augmentation method generates its augmented samples - independently. - :param aug_num: number of augmented samples to be generated. If - `sequential` is True, a total of aug_num augmented samples will be - generated. If it's False, (aug_num * - #enabled_aug_methods) augmented samples will be generated. - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated texts in the final - datasets and the original texts will be removed. It's True by - default. - :param replace_similar_word: whether to enable the augmentation method of - replacing random words with their similar words in the original - texts. e.g. "这里一共有5种不同的数据增强方法" --> "这边一共有5种不同的数据增强方法" - :param replace_homophone_char: whether to enable the augmentation method - of replacing random characters with their homophones in the - original texts. e.g. "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的濖据增强方法" - :param delete_random_char: whether to enable the augmentation method of - deleting random characters from the original texts. e.g. - "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据增强" - :param swap_random_char: whether to enable the augmentation method of - swapping random contiguous characters in the original texts. e.g. - "这里一共有5种不同的数据增强方法" --> "这里一共有5种不同的数据强增方法" - :param replace_equivalent_num: whether to enable the augmentation method - of replacing random numbers with their equivalent representations - in the original texts. **Notice**: Only for numbers for now. e.g. - "这里一共有5种不同的数据增强方法" --> "这里一共有伍种不同的数据增强方法" - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - self.aug_num = aug_num - if aug_num >= 10: - logger.warning(f'Relatively large augmentation number [{aug_num}]' - f' might generate a large number of new samples and ' - f'require more memory and disk space.') - self.sequential = sequential - self.keep_original_sample = keep_original_sample - - # hide the redundant outputs from the nlpcda library - with HiddenPrints(): - import warnings - warnings.filterwarnings('ignore') - - self.aug_pipeline = [] - # sample level - - # word level - if replace_similar_word: - # the first sample of the augmented sample list is the same as - # the original sample, so we need to generate one more augmented - # sample to get the expected number of augmented samples. Same - # below - create_num = (self.aug_num + 1) \ - if not self.sequential or len(self.aug_pipeline) == 0 \ - else 2 - self.aug_pipeline.append( - nlpcda.Similarword(create_num=create_num)) - - # char level - if replace_homophone_char: - create_num = (self.aug_num + 1) \ - if not self.sequential or len(self.aug_pipeline) == 0 \ - else 2 - self.aug_pipeline.append( - nlpcda.Homophone(create_num=create_num)) - if delete_random_char: - create_num = (self.aug_num + 1) \ - if not self.sequential or len(self.aug_pipeline) == 0 \ - else 2 - self.aug_pipeline.append( - nlpcda.RandomDeleteChar(create_num=create_num)) - if swap_random_char: - create_num = (self.aug_num + 1) \ - if not self.sequential or len(self.aug_pipeline) == 0 \ - else 2 - # only use char_gram=1 for relatively minor changes - self.aug_pipeline.append( - nlpcda.CharPositionExchange(create_num=create_num, - char_gram=1)) - - # only for numbers now - if replace_equivalent_num: - create_num = (self.aug_num + 1) \ - if not self.sequential or len(self.aug_pipeline) == 0 \ - else 2 - self.aug_pipeline.append( - nlpcda.EquivalentChar(create_num=create_num))
- -
[docs] def process_batched(self, samples): - # no augmentation methods are opened - if len(self.aug_pipeline) == 0: - if self.keep_original_sample: - return samples - else: - return {key: [] for key in samples} - - texts_to_aug = samples[self.text_key] - res_samples = deepcopy(samples) - - # get augmented texts - if self.sequential: - aug_texts = texts_to_aug - for aug_method in self.aug_pipeline: - results = [] - for text in aug_texts: - # aug and skip the original text - result = aug_method.replace(text) - results += result[1:] if len(result) > 1 else result - aug_texts = results[:] - if len(aug_texts) == 1 and aug_texts[0] == texts_to_aug[0]: - aug_texts = [] - else: - # apply each aug method to generate several augmented texts - aug_texts = [] - for aug_method in self.aug_pipeline: - aug_texts += aug_method.replace(texts_to_aug[0])[1:] - - # add augmented samples to the batch with other replicate fields - if self.keep_original_sample: - res_samples[self.text_key] += aug_texts - else: - res_samples[self.text_key] = aug_texts - # add other replicate fields - for key in res_samples: - if key != self.text_key: - res_samples[key] = res_samples[key] * \ - len(res_samples[self.text_key]) - return res_samples
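A sketch under the assumption that nlpcda and its dictionary data are installed; the default 'text' key is used:

    from data_juicer.ops.mapper.nlpcda_zh_mapper import NlpcdaZhMapper

    op = NlpcdaZhMapper(aug_num=2, replace_similar_word=True)
    samples = {'text': ['这里一共有5种不同的数据增强方法']}
    out = op.process_batched(samples)
    print(out['text'])  # the original text plus up to 2 similar-word variants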
diff --git a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html b/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
deleted file mode 100644
index a77404dc8..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_qa_mapper.html
+++ /dev/null
@@ -1,253 +0,0 @@
Source code for data_juicer.ops.mapper.optimize_qa_mapper

-import re
-from typing import Dict, Optional
-
-from loguru import logger
-
-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE, Mapper
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-torch = LazyLoader('torch', 'torch')
-vllm = LazyLoader('vllm', 'vllm')
-
-OP_NAME = 'optimize_qa_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class OptimizeQAMapper(Mapper): - """ - Mapper to optimize question-answer pairs. - """ - - # avoid leading whitespace - DEFAULT_SYSTEM_PROMPT = ('请优化输入的问答对,使【问题】和【回答】都更加详细、准确。' - '必须按照以下标记格式,直接输出优化后的问答对:\n' - '【问题】\n' - '优化后的问题\n' - '【回答】\n' - '优化后的回答') - DEFAULT_INPUT_TEMPLATE = '以下是原始问答对:\n{}' - DEFAULT_QA_PAIR_TEMPLATE = '【问题】\n{}\n【回答】\n{}' - DEFAULT_OUTPUT_PATTERN = r'.*?【问题】\s*(.*?)\s*【回答】\s*(.*)' - - _accelerator = 'cuda' - -
[docs] def __init__(self, - hf_model: str = 'Qwen/Qwen2.5-7B-Instruct', - *, - system_prompt: Optional[str] = None, - input_template: Optional[str] = None, - qa_pair_template: Optional[str] = None, - output_pattern: Optional[str] = None, - enable_vllm: bool = False, - model_params: Optional[Dict] = None, - sampling_params: Optional[Dict] = None, - **kwargs): - """ - Initialization method. - - :param hf_model: Hugging Face model ID. - :param system_prompt: System prompt for guiding the optimization task. - :param input_template: Template for building the input for the model. - Please make sure the template contains one placeholder '{}', which - corresponds to the question and answer pair formatted by - param `qa_pair_template`. - :param qa_pair_template: Template for formatting the question and - answer pair. Please make sure the template contains two - '{}' to format question and answer. - :param output_pattern: Regular expression pattern to extract question - and answer from the model response. - :param enable_vllm: Whether to use vLLM for inference acceleration. - :param model_params: Parameters for initializing the model. - :param sampling_params: Sampling parameters for text generation (e.g., - {'temperature': 0.9, 'top_p': 0.95}). - :param kwargs: Extra keyword arguments. - """ - super().__init__(**kwargs) - - self.system_prompt = system_prompt or self.DEFAULT_SYSTEM_PROMPT - self.input_template = input_template or self.DEFAULT_INPUT_TEMPLATE - self.qa_pair_template = qa_pair_template or \ - self.DEFAULT_QA_PAIR_TEMPLATE - self.output_pattern = output_pattern or self.DEFAULT_OUTPUT_PATTERN - - self.enable_vllm = enable_vllm - model_params = model_params or {} - sampling_params = sampling_params or {} - - if enable_vllm: - assert torch.cuda.device_count() >= 1, 'must be executed on CUDA' - # cannot initialize vllm replicas on different GPUs - self.num_proc = 1 - if model_params.get('tensor_parallel_size') is None: - tensor_parallel_size = torch.cuda.device_count() - logger.info(f'Set tensor_parallel_size to ' - f'{tensor_parallel_size} for vllm.') - model_params['tensor_parallel_size'] = tensor_parallel_size - self.model_key = prepare_model( - model_type='vllm', - pretrained_model_name_or_path=hf_model, - **model_params) - self.sampling_params = vllm.SamplingParams(**sampling_params) - else: - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_model, - return_pipe=True, - **model_params) - self.sampling_params = sampling_params
- -
[docs] def build_input(self, sample): - qa_pair = self.qa_pair_template.format(sample[self.query_key], - sample[self.response_key]) - input_prompt = self.input_template.format(qa_pair) - return input_prompt
- -
[docs] def parse_output(self, raw_output): - logger.debug(raw_output) - match = re.match(self.output_pattern, raw_output, re.DOTALL) - if match: - return match.group(1).strip(), match.group(2).strip() - else: - return None, None
- -
[docs] def process_single(self, sample=None, rank=None): - model, _ = get_model(self.model_key, rank, self.use_cuda()) - - input_prompt = self.build_input(sample) - messages = [{ - 'role': 'system', - 'content': self.system_prompt - }, { - 'role': 'user', - 'content': input_prompt - }] - - if self.enable_vllm: - response = model.chat(messages, self.sampling_params) - output = response[0].outputs[0].text - else: - # model is pipe - response = model(messages, - return_full_text=False, - **self.sampling_params) - output = response[0]['generated_text'] - - parsed_q, parsed_a = self.parse_output(output) - if parsed_q: - sample[self.query_key] = parsed_q - if parsed_a: - sample[self.response_key] = parsed_a - - return sample
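The output-parsing step can be exercised without loading any model; a small sketch of how DEFAULT_OUTPUT_PATTERN splits a model response:

    import re

    from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper

    raw = '【问题】\n什么是数据增强?\n【回答】\n一种扩充训练样本的方法。'
    m = re.match(OptimizeQAMapper.DEFAULT_OUTPUT_PATTERN, raw, re.DOTALL)
    print(m.group(1).strip())  # 什么是数据增强?
    print(m.group(2).strip())  # 一种扩充训练样本的方法。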
diff --git a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html b/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
deleted file mode 100644
index ab704fc5a..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_query_mapper.html
+++ /dev/null
@@ -1,129 +0,0 @@
Source code for data_juicer.ops.mapper.optimize_query_mapper

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
-from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
-
-OP_NAME = 'optimize_query_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class OptimizeQueryMapper(OptimizeQAMapper): - """ - Mapper to optimize query in question-answer pairs. - """ - - DEFAULT_SYSTEM_PROMPT = '优化问答对中的【问题】,将其更加详细具体,但仍可以由原答案回答。只输出优化后的【问题】,不要输出多余内容。' # noqa: E501 - - _accelerator = 'cuda' - -
[docs] def parse_output(self, raw_output): - return raw_output.strip(), None
diff --git a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html b/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
deleted file mode 100644
index 22993ce81..000000000
--- a/_modules/data_juicer/ops/mapper/optimize_response_mapper.html
+++ /dev/null
@@ -1,129 +0,0 @@
Source code for data_juicer.ops.mapper.optimize_response_mapper

-from data_juicer.ops.base_op import OPERATORS, UNFORKABLE
-from data_juicer.ops.mapper.optimize_qa_mapper import OptimizeQAMapper
-
-OP_NAME = 'optimize_response_mapper'
-
-
-# TODO: Extend LLM-based OPs into API-based implementation.
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -class OptimizeResponseMapper(OptimizeQAMapper): - """ - Mapper to optimize response in question-answer pairs. - """ - - DEFAULT_SYSTEM_PROMPT = '请优化问答对中的回答,将其更加详细具体,但仍可以回答原问题。只输出优化后的回答,不要输出多余内容。' - - _accelerator = 'cuda' - -
[docs] def parse_output(self, raw_output): - return None, raw_output.strip()
diff --git a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html b/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
deleted file mode 100644
index 238ea4b1f..000000000
--- a/_modules/data_juicer/ops/mapper/punctuation_normalization_mapper.html
+++ /dev/null
@@ -1,174 +0,0 @@
Source code for data_juicer.ops.mapper.punctuation_normalization_mapper

-# Some code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('punctuation_normalization_mapper') -class PunctuationNormalizationMapper(Mapper): - """Mapper to normalize unicode punctuations to English punctuations in text - samples.""" - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.punctuation_unicode = { - ',': ',', - '。': '.', - '、': ',', - '„': '"', - '”': '"', - '“': '"', - '«': '"', - '»': '"', - '1': '"', - '」': '"', - '「': '"', - '《': '"', - '》': '"', - '´': "'", - '∶': ':', - ':': ':', - '?': '?', - '!': '!', - '(': '(', - ')': ')', - ';': ';', - '–': '-', - '—': ' - ', - '.': '. ', - '~': '~', - '’': "'", - '…': '...', - '━': '-', - '〈': '<', - '〉': '>', - '【': '[', - '】': ']', - '%': '%', - '►': '-', - }
- -
[docs] def process_batched(self, samples): - samples[self.text_key] = [ - ''.join([self.punctuation_unicode.get(c, c) for c in text]) - for text in samples[self.text_key] - ] - return samples
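A runnable sketch with the default 'text' key:

    from data_juicer.ops.mapper.punctuation_normalization_mapper import \
        PunctuationNormalizationMapper

    op = PunctuationNormalizationMapper()
    samples = {'text': ['“你好”,世界。']}
    print(op.process_batched(samples)['text'])  # ['"你好",世界.']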
diff --git a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html b/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
deleted file mode 100644
index 292e8843a..000000000
--- a/_modules/data_juicer/ops/mapper/remove_bibliography_mapper.html
+++ /dev/null
@@ -1,149 +0,0 @@
Source code for data_juicer.ops.mapper.remove_bibliography_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_bibliography_mapper') -class RemoveBibliographyMapper(Mapper): - """Mapper to remove the bibliography at the end of documents in LaTeX - samples.""" - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = r'(\\appendix|' - self.pattern += r'\\begin\{references\}|' - self.pattern += r'\\begin\{REFERENCES\}|' - self.pattern += r'\\begin\{thebibliography\}|' - self.pattern += r'\\bibliography\{.*\}' - self.pattern += r').*$'
- -
[docs] def process_batched(self, samples): - samples[self.text_key] = [ - re.sub(pattern=self.pattern, - repl=r'', - string=text, - flags=re.DOTALL) for text in samples[self.text_key] - ] - - return samples
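A runnable sketch on a toy LaTeX string (default 'text' key assumed):

    from data_juicer.ops.mapper.remove_bibliography_mapper import \
        RemoveBibliographyMapper

    op = RemoveBibliographyMapper()
    tex = '\\section{Intro} body\n\\begin{thebibliography}\n...\n\\end{thebibliography}'
    print(op.process_batched({'text': [tex]})['text'])  # ['\\section{Intro} body\n']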
diff --git a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html b/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
deleted file mode 100644
index 5e7c95de8..000000000
--- a/_modules/data_juicer/ops/mapper/remove_comments_mapper.html
+++ /dev/null
@@ -1,170 +0,0 @@
Source code for data_juicer.ops.mapper.remove_comments_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
-# --------------------------------------------------------
-
-from typing import List, Union
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_comments_mapper') -class RemoveCommentsMapper(Mapper): - """ - Mapper to remove comments in different kinds of documents. - - Only support 'tex' for now. - """ - - _batched_op = True - -
[docs] def __init__(self, - doc_type: Union[str, List[str]] = 'tex', - inline: bool = True, - multiline: bool = True, - *args, - **kwargs): - """ - Initialization method. - - :param doc_type: Type of document from which to remove comments. - Only 'tex' is supported for now. - :param inline: Whether to remove inline comments. - :param multiline: Whether to remove multiline comments. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.doc_type = doc_type - self.inline = inline - self.multiline = multiline
- -
[docs] def process_batched(self, samples): - # TODO: remove different comments by sample type - - for idx, text in enumerate(samples[self.text_key]): - if self.inline: - # remove all inline comments within each line - text = re.sub(pattern=r'[^\\]%.+$', - repl=r'', - string=text, - flags=re.MULTILINE) - - if self.multiline: - # remove comments that occupy whole lines - text = re.sub(pattern=r'^%.*\n?', - repl=r'', - string=text, - flags=re.MULTILINE) - - samples[self.text_key][idx] = text - - return samples
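A runnable sketch showing both comment types being stripped (default 'text' key):

    from data_juicer.ops.mapper.remove_comments_mapper import RemoveCommentsMapper

    op = RemoveCommentsMapper(doc_type='tex')
    tex = '% preamble comment\nresult is 50\\% better % TODO recheck\n'
    print(op.process_batched({'text': [tex]})['text'])  # ['result is 50\\% better\n']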
diff --git a/_modules/data_juicer/ops/mapper/remove_header_mapper.html b/_modules/data_juicer/ops/mapper/remove_header_mapper.html
deleted file mode 100644
index 823a87413..000000000
--- a/_modules/data_juicer/ops/mapper/remove_header_mapper.html
+++ /dev/null
@@ -1,161 +0,0 @@
Source code for data_juicer.ops.mapper.remove_header_mapper

-# Some code here has been modified from:
-# https://github.com/togethercomputer/RedPajama-Data/tree/rp_v1/
-# --------------------------------------------------------
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_header_mapper') -class RemoveHeaderMapper(Mapper): - """Mapper to remove headers at the beginning of documents in LaTeX - samples.""" - - _batched_op = True - -
[docs] def __init__(self, drop_no_head: bool = True, *args, **kwargs): - """ - Initialization method. - - :param drop_no_head: whether to drop sample texts without - headers. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = r'^(.*?)(' - self.pattern += r'\\\bchapter\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bpart\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsubsubsection\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}|' - self.pattern += r'\\\bsubparagraph\b\*?(?:\[(.*?)\])?\{(.*?)\}' - self.pattern += r')' - - self.drop_no_head = drop_no_head
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if not re.search(self.pattern, text, flags=re.DOTALL): - # drop texts without headers if required, then skip - if self.drop_no_head: - samples[self.text_key][idx] = '' - continue - text = re.sub(pattern=self.pattern, - repl=r'\2', - string=text, - flags=re.DOTALL) - - samples[self.text_key][idx] = text - - return samples
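A runnable sketch: everything before the first recognized sectioning command is dropped:

    from data_juicer.ops.mapper.remove_header_mapper import RemoveHeaderMapper

    op = RemoveHeaderMapper()
    tex = 'junk before the first header \\section{Intro} body text'
    print(op.process_batched({'text': [tex]})['text'])  # ['\\section{Intro} body text']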
diff --git a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html b/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
deleted file mode 100644
index 4ff69cc37..000000000
--- a/_modules/data_juicer/ops/mapper/remove_long_words_mapper.html
+++ /dev/null
@@ -1,164 +0,0 @@
Source code for data_juicer.ops.mapper.remove_long_words_mapper

-# Some code here has been modified from:
-# https://huggingface.co/spaces/huggingface/text-data-filtering
-# --------------------------------------------------------
-
-import sys
-
-from ..base_op import OPERATORS, Mapper
-from ..common import (SPECIAL_CHARACTERS, merge_on_whitespace_tab_newline,
-                      split_on_newline_tab_whitespace, strip)
-
-
-
[docs]@OPERATORS.register_module('remove_long_words_mapper') -class RemoveLongWordsMapper(Mapper): - """Mapper to remove long words within a specific range.""" - - _batched_op = True - -
[docs] def __init__(self, - min_len: int = 1, - max_len: int = sys.maxsize, - *args, - **kwargs): - """ - Initialization method. - - :param min_len: The min mapper word length in this op, words - will be filtered if their length is below this parameter. - :param max_len: The max mapper word length in this op, words - will be filtered if their length exceeds this parameter. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_len = min_len - self.max_len = max_len
- -
[docs] def should_keep_long_word(self, word): - if self.min_len <= len(word) <= self.max_len: - return True - elif self.min_len <= len(strip(word, - SPECIAL_CHARACTERS)) <= self.max_len: - return True - else: - return False
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - sentences = split_on_newline_tab_whitespace(text) - sentences = [[[ - word for word in subsentence - if self.should_keep_long_word(word) - ] for subsentence in sentence] for sentence in sentences] - samples[self.text_key][idx] = merge_on_whitespace_tab_newline( - sentences) - return samples
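A runnable sketch (default 'text' key):

    from data_juicer.ops.mapper.remove_long_words_mapper import RemoveLongWordsMapper

    op = RemoveLongWordsMapper(max_len=12)
    samples = {'text': ['normal words and one hyperlongunbrokentoken here']}
    print(op.process_batched(samples)['text'])  # ['normal words and one here']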
diff --git a/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html b/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html
deleted file mode 100644
index 90143339c..000000000
--- a/_modules/data_juicer/ops/mapper/remove_non_chinese_character_mapper.html
+++ /dev/null
@@ -1,156 +0,0 @@
Source code for data_juicer.ops.mapper.remove_non_chinese_character_mapper

-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_non_chinese_character_mapper') -class RemoveNonChineseCharacterlMapper(Mapper): - """Mapper to remove non-Chinese characters in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - keep_alphabet: bool = True, - keep_number: bool = True, - keep_punc: bool = True, - *args, - **kwargs): - """ - Initialization method. - - :param keep_alphabet: whether to keep alphabet - :param keep_number: whether to keep number - :param keep_punc: whether to keep punctuation - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = u'[^\u4e00-\u9fa5' - if keep_alphabet: - self.pattern += u'A-Za-z' - if keep_number: - self.pattern += u'0-9' - if keep_punc: - self.pattern += u'., ,\\-。%《*》/•、&&(—)(+):?!!“”·]+' - else: - self.pattern += u']'
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if not re.search(self.pattern, text, flags=re.DOTALL): - continue - - samples[self.text_key][idx] = re.sub(pattern=self.pattern, - repl=r'', - string=text, - flags=re.DOTALL) - return samples
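A runnable sketch keeping only Chinese characters and digits:

    from data_juicer.ops.mapper.remove_non_chinese_character_mapper import \
        RemoveNonChineseCharacterlMapper

    op = RemoveNonChineseCharacterlMapper(keep_alphabet=False, keep_number=True,
                                          keep_punc=False)
    print(op.process_batched({'text': ['Data-Juicer共有3种方式!']})['text'])
    # ['共有3种方式']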
diff --git a/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html b/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html
deleted file mode 100644
index 85cce03db..000000000
--- a/_modules/data_juicer/ops/mapper/remove_repeat_sentences_mapper.html
+++ /dev/null
@@ -1,182 +0,0 @@
Source code for data_juicer.ops.mapper.remove_repeat_sentences_mapper

-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-def split_sentence(text):
-    text = re.sub('([.。!!?\?])([^’”])', r'\1\n\2', text)  # noqa
-    text = re.sub('(\.{6})([^’”])', r'\1\n\2', text)  # noqa
-    text = re.sub('(\…{2})([^’”])', r'\1\n\2', text)  # noqa
-    text = re.sub('([.。!!?\?\.{6}\…{2}][’”])([^’”])', r'\1\n\2', text)  # noqa
-    return text.split('\n')
-
-
-
[docs]@OPERATORS.register_module('remove_repeat_sentences_mapper') -class RemoveRepeatSentencesMapper(Mapper): - """Mapper to remove repeat sentences in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - lowercase: bool = False, - ignore_special_character: bool = True, - min_repeat_sentence_length: int = 2, - *args, - **kwargs): - """ - Initialization method. - - :param lowercase: Whether to convert sample text to lower case - :param ignore_special_character: Whether to ignore special - characters when judging repeated sentences. Special characters - are all characters except Chinese characters, letters and - numbers. - :param min_repeat_sentence_length: Sentences shorter than this - length will not be deduplicated. If ignore_special_character is - set to True, then special characters are not included in this - length. - :param args: extra args - :param kwargs: extra args - """ - - super().__init__(*args, **kwargs) - self.lowercase = lowercase - self.min_repeat_sentence_length = min_repeat_sentence_length - self.remove_regex = re.compile(r'[^a-zA-Z0-9\u4e00-\u9fa5\n\t ]' - ) if ignore_special_character else None
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - lines = [e for e in text.split('\n')] - new_lines = [] - hash_set = set([]) - for line in lines: - new_sent = '' - if line: - sentences = split_sentence(line) - for sentence in sentences: - copy = sentence.strip() - if self.lowercase: - copy = copy.lower() - if self.remove_regex: - copy = self.remove_regex.sub('', copy) - - if len(copy) < self.min_repeat_sentence_length: - new_sent += sentence - elif copy not in hash_set: - new_sent += sentence - hash_set.add(copy) - new_lines.append(new_sent) - - samples[self.text_key][idx] = '\n'.join(new_lines) - - return samples
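A runnable sketch: the duplicated sentence is kept only once:

    from data_juicer.ops.mapper.remove_repeat_sentences_mapper import \
        RemoveRepeatSentencesMapper

    op = RemoveRepeatSentencesMapper()
    text = '今天天气很好。今天天气很好。我们出门吧。'
    print(op.process_batched({'text': [text]})['text'])  # ['今天天气很好。我们出门吧。']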
diff --git a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html b/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
deleted file mode 100644
index 09aa7aeef..000000000
--- a/_modules/data_juicer/ops/mapper/remove_specific_chars_mapper.html
+++ /dev/null
@@ -1,152 +0,0 @@
Source code for data_juicer.ops.mapper.remove_specific_chars_mapper

-from typing import List, Union
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_specific_chars_mapper') -class RemoveSpecificCharsMapper(Mapper): - """Mapper to clean specific chars in text samples.""" - - _batched_op = True - -
[docs] def __init__(self, - chars_to_remove: Union[str, List[str]] = '◆●■►▼▲▴∆▻▷❖♡□', - *args, - **kwargs): - """ - Initialization method. - - :param chars_to_remove: a list or a string including all - characters that need to be removed from text. - :param args: extra args - :param kwargs: extra args - """ - - super().__init__(*args, **kwargs) - if chars_to_remove: - self.pattern = '[' + '|'.join(chars_to_remove) + ']' - else: - self.pattern = None
- -
[docs] def process_batched(self, samples): - if self.pattern is None: - return samples - - samples[self.text_key] = [ - re.sub(pattern=self.pattern, - repl=r'', - string=text, - flags=re.DOTALL) for text in samples[self.text_key] - ] - return samples
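A runnable sketch with the default character set:

    from data_juicer.ops.mapper.remove_specific_chars_mapper import \
        RemoveSpecificCharsMapper

    op = RemoveSpecificCharsMapper()  # defaults to ◆●■►▼▲▴∆▻▷❖♡□
    print(op.process_batched({'text': ['◆ item one ● item two']})['text'])
    # [' item one  item two']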
diff --git a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html b/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
deleted file mode 100644
index 6eb1494aa..000000000
--- a/_modules/data_juicer/ops/mapper/remove_table_text_mapper.html
+++ /dev/null
@@ -1,154 +0,0 @@
Source code for data_juicer.ops.mapper.remove_table_text_mapper

-import regex as re
-from pydantic import Field
-from typing_extensions import Annotated
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('remove_table_text_mapper') -class RemoveTableTextMapper(Mapper): - """ - Mapper to remove table texts from text samples. - - A regular expression is used to remove tables whose column counts - fall within the specified range. - """ - - _batched_op = True - -
[docs] def __init__(self, - min_col: Annotated[int, Field(ge=2, le=20)] = 2, - max_col: Annotated[int, Field(ge=2, le=20)] = 20, - *args, - **kwargs): - """ - Initialization method. - - :param min_col: The min number of columns of table to remove. - :param max_col: The max number of columns of table to remove. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.min_col = min_col - self.max_col = max_col - self.pattern = r'(?<=\n)((\S+?)([ |\t](\S+?)){%d}\n+){2,}'
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - for i in range(self.min_col - 1, self.max_col): - pattern = re.compile(self.pattern % i) - text = pattern.sub('', text) - - samples[self.text_key][idx] = text - - return samples
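A runnable sketch: three consecutive two-column rows are recognized as a table and removed:

    from data_juicer.ops.mapper.remove_table_text_mapper import RemoveTableTextMapper

    op = RemoveTableTextMapper(min_col=2, max_col=5)
    text = 'intro\nname age\nalice 30\nbob 25\noutro'
    print(op.process_batched({'text': [text]})['text'])  # ['intro\noutro']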
diff --git a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html b/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
deleted file mode 100644
index 8637514e5..000000000
--- a/_modules/data_juicer/ops/mapper/remove_words_with_incorrect_substrings_mapper.html
+++ /dev/null
@@ -1,184 +0,0 @@
Source code for data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper

-from typing import List, Optional
-
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..common import (SPECIAL_CHARACTERS, get_words_from_document,
-                      merge_on_whitespace_tab_newline,
-                      split_on_newline_tab_whitespace, strip)
-
-OP_NAME = 'remove_words_with_incorrect_substrings_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class RemoveWordsWithIncorrectSubstringsMapper(Mapper): - """Mapper to remove words with incorrect substrings.""" - - _batched_op = True - -
[docs] def __init__(self, - lang: str = 'en', - tokenization: bool = False, - substrings: Optional[List[str]] = None, - *args, - **kwargs): - """ - Initialization method. - - :param lang: sample in which language - :param tokenization: whether to use model to tokenize documents - :param substrings: The incorrect substrings in words. - :param args: extra args - :param kwargs: extra args - """ - if substrings is None: - substrings = ['http', 'www', '.com', 'href', '//'] - super().__init__(*args, **kwargs) - self.tokenization = tokenization - self.substrings = substrings - self.lang = lang - if tokenization: - self.model_key = prepare_model(model_type='sentencepiece', - lang=lang)
- -
[docs] def should_keep_word_with_incorrect_substrings(self, word, substrings): - word = strip(word, SPECIAL_CHARACTERS) - should_keep = all([(i_substr not in word) for i_substr in substrings]) - return should_keep
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - if self.tokenization: - tokenizer = get_model(self.model_key) - sentences = get_words_from_document( - text, - token_func=tokenizer.encode_as_pieces - if tokenizer else None) - words = [ - word.replace('▁', '') for word in sentences - if self.should_keep_word_with_incorrect_substrings( - word.replace('▁', ''), self.substrings) - ] - if len(words) != len(sentences): - text = ''.join(words) - else: - sentences = split_on_newline_tab_whitespace(text) - sentences = [[[ - word for word in subsentence - if self.should_keep_word_with_incorrect_substrings( - word, self.substrings) - ] for subsentence in sentence] for sentence in sentences] - text = merge_on_whitespace_tab_newline(sentences) - - samples[self.text_key][idx] = text - - return samples
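A runnable sketch with the default substring list (http, www, .com, href, //):

    from data_juicer.ops.mapper.remove_words_with_incorrect_substrings_mapper import \
        RemoveWordsWithIncorrectSubstringsMapper

    op = RemoveWordsWithIncorrectSubstringsMapper()
    samples = {'text': ['see https://example.com for details']}
    print(op.process_batched(samples)['text'])  # ['see for details']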
diff --git a/_modules/data_juicer/ops/mapper/replace_content_mapper.html b/_modules/data_juicer/ops/mapper/replace_content_mapper.html
deleted file mode 100644
index f1be3e74e..000000000
--- a/_modules/data_juicer/ops/mapper/replace_content_mapper.html
+++ /dev/null
@@ -1,175 +0,0 @@
Source code for data_juicer.ops.mapper.replace_content_mapper

-from typing import List, Union
-
-import regex as re
-
-from ..base_op import OPERATORS, Mapper
-
-
-
[docs]@OPERATORS.register_module('replace_content_mapper') -class ReplaceContentMapper(Mapper): - """Mapper to replace all content in the text that matches - a specific regular expression pattern with a designated - replacement string.""" - - _batched_op = True - -
[docs] def __init__(self, - pattern: Union[str, List[str], None] = None, - repl: Union[str, List[str]] = '', - *args, - **kwargs): - """ - Initialization method. - - :param pattern: regular expression pattern(s) to search for within text - :param repl: replacement string(s), default is empty string - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.pattern = pattern - self.repl = repl - self.compiled_patterns = [] - if isinstance(pattern, str): - self.compiled_patterns.append(self._prepare_pattern(pattern)) - elif isinstance(pattern, list): - for p in pattern: - self.compiled_patterns.append(self._prepare_pattern(p))
- - def _prepare_pattern(self, pattern: str) -> re.Pattern: - """Prepare the regular expression pattern.""" - if ((pattern is not None and len(pattern) > 2) - and (pattern.startswith("r'") and pattern.endswith("'") - or pattern.startswith('r"') and pattern.endswith('"'))): - pattern = pattern[2:-1] - return re.compile(pattern, flags=re.DOTALL) - -
[docs] def process_batched(self, samples): - if self.pattern is None: - return samples - - for idx, text in enumerate(samples[self.text_key]): - for i, pattern in enumerate(self.compiled_patterns): - if isinstance(self.repl, list) and i < len(self.repl): - replacement = self.repl[i] - elif isinstance(self.repl, list) and i >= len(self.repl): - raise ValueError(f'pattern length: {len(self.pattern)} ' - f'must be equal to ' - f'repl length: {len(self.repl)}') - else: - replacement = self.repl - - text = pattern.sub(replacement, text) - - samples[self.text_key][idx] = text - - return samples
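A runnable sketch masking phone-like digit groups:

    from data_juicer.ops.mapper.replace_content_mapper import ReplaceContentMapper

    op = ReplaceContentMapper(pattern=r'\d{3}-\d{4}', repl='***')
    print(op.process_batched({'text': ['call 555-0199 today']})['text'])
    # ['call *** today']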
diff --git a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html b/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
deleted file mode 100644
index 74fc54c61..000000000
--- a/_modules/data_juicer/ops/mapper/sentence_split_mapper.html
+++ /dev/null
@@ -1,146 +0,0 @@
Source code for data_juicer.ops.mapper.sentence_split_mapper

-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..common import get_sentences_from_document
-
-OP_NAME = 'sentence_split_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class SentenceSplitMapper(Mapper): - """Mapper to split text samples to sentences.""" - - _batched_op = True - -
[docs] def __init__(self, lang: str = 'en', *args, **kwargs): - """ - Initialization method. - - :param lang: split sentence of text in which language. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self.lang = lang - self.model_key = prepare_model(model_type='nltk', lang=lang)
- -
[docs] def process_batched(self, samples): - - nltk_model = get_model(self.model_key) - - samples[self.text_key] = [ - get_sentences_from_document( - text, model_func=nltk_model.tokenize if nltk_model else None) - for text in samples[self.text_key] - ] - - return samples
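A sketch assuming the NLTK sentence model for English can be fetched by prepare_model; the exact joining of the output is determined by get_sentences_from_document:

    from data_juicer.ops.mapper.sentence_split_mapper import SentenceSplitMapper

    op = SentenceSplitMapper(lang='en')
    samples = {'text': ['Hello there. How are you today?']}
    print(op.process_batched(samples)['text'])  # sentences separated by newlines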
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html
deleted file mode 100644
index 142739fa2..000000000
--- a/_modules/data_juicer/ops/mapper/video_captioning_from_audio_mapper.html
+++ /dev/null
@@ -1,250 +0,0 @@
Source code for data_juicer.ops.mapper.video_captioning_from_audio_mapper

-import copy
-import os
-
-import regex as re
-
-from data_juicer.utils.lazy_loader import AUTOINSTALL
-from data_juicer.utils.mm_utils import SpecialTokens, extract_audio_from_video
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-
-NAME = 'video_captioning_from_audio_mapper'
-
-
-
[docs]@OPERATORS.register_module(NAME) -class VideoCaptioningFromAudioMapper(Mapper): - """Mapper to caption a video according to its audio streams based on - the Qwen-Audio model. - """ - - _accelerator = 'cuda' - _batched_op = True - -
[docs] def __init__(self, keep_original_sample: bool = True, *args, **kwargs): - """ - Initialization method. - - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only captioned samples in the - final datasets and the original samples will be removed. It's True - by default. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - AUTOINSTALL.check([ - 'transformers', 'transformers_stream_generator', 'einops', - 'accelerate', 'tiktoken' - ]) - - self.keep_original_sample = keep_original_sample - self.extra_args = kwargs - - self._hf_qwen_audio = 'Qwen/Qwen-Audio' - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=self._hf_qwen_audio, - trust_remote_code=True, - ) - self.prompt = '<|startoftranscription|><|unkown|><|caption|>' \ - '<|unkown|><|notimestamps|><|wo_itn|>' - self.response_remove_pattern = re.compile(r'<\|.*?\|>')
- - def _process_single_sample(self, sample, rank=None): - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - return [] - - # get paths of all video(s) - loaded_video_keys = sample[self.video_key] - - # get models - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - offset = 0 - captioned_sample = copy.deepcopy(sample) - # generate for each video chunk by chunk - captioned_texts = '' - left_video_keys = [] - for chunk in sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks - if not chunk.strip(): - continue - - vid_count = chunk.count(SpecialTokens.video) - - captioned_text_list = [] - for video in loaded_video_keys[offset:offset + vid_count]: - # only extract audio for index 0 for now - _, _, valid_indexes = extract_audio_from_video( - video, video + '.mp3', stream_indexes=[0]) - if len(valid_indexes) == 0: - # there is no valid audio streams. Skip! - continue - extracted_audio_path = video + '_0.mp3' - query = f'<audio>{extracted_audio_path}</audio>{self.prompt}' - - # start to inference - audio_info = processor.process_audio(query) - inputs = processor(query, - return_tensors='pt', - audio_info=audio_info).to(model.device) - outputs = model.generate(**inputs, audio_info=audio_info) - response = processor.decode(outputs[0], - skip_special_tokens=True, - audio_info=audio_info) - # remove audio path - response = response.replace(extracted_audio_path, '').replace( - '<audio>', '').replace('</audio>', '') - response = self.response_remove_pattern.sub('', - response).strip() - if response == '': - # generate failure. Skip! - continue - captioned_text_list.append(f'{SpecialTokens.video} {response}') - left_video_keys.append(video) - # remove extracted audio files - os.remove(extracted_audio_path) - offset += vid_count - captioned_text = ''.join(captioned_text_list) - - # add special tokens - captioned_texts += f'{captioned_text}{SpecialTokens.eoc}' - - captioned_sample[self.text_key] = captioned_texts - captioned_sample[self.video_key] = left_video_keys - return [captioned_sample] - -
[docs] def process_batched(self, samples, rank=None): - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - samples_after_split = [] - # do split for each sample within the batch - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_split.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample, - rank=rank) - if len(generated_samples) != 0: - samples_after_split.extend(generated_samples) - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_split[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_split] - - return res_samples
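A heavyweight sketch: it assumes Qwen/Qwen-Audio weights, ffmpeg and a CUDA device are available; the special-token strings and the 'videos' key below are assumptions about Data-Juicer's defaults:

    from data_juicer.ops.mapper.video_captioning_from_audio_mapper import \
        VideoCaptioningFromAudioMapper

    op = VideoCaptioningFromAudioMapper(keep_original_sample=False)
    samples = {
        'text': ['<__dj__video> a short clip <|__dj__eoc|>'],  # assumed token forms
        'videos': [['/path/to/clip.mp4']],  # hypothetical path
    }
    out = op.process_batched(samples, rank=0)
    print(out['text'])  # audio-derived captions, one chunk per video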
diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html
deleted file mode 100644
index 95870bcaa..000000000
--- a/_modules/data_juicer/ops/mapper/video_captioning_from_frames_mapper.html
+++ /dev/null
@@ -1,471 +0,0 @@
Source code for data_juicer.ops.mapper.video_captioning_from_frames_mapper

-# yapf: disable
-import copy
-import random
-from typing import Optional
-
-import numpy as np
-from loguru import logger
-from PIL import ImageOps
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
-                                        extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        insert_texts_after_placeholders,
-                                        load_data_with_context, load_video,
-                                        remove_non_special_tokens,
-                                        remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-simhash = LazyLoader('simhash', 'simhash')
-
-OP_NAME = 'video_captioning_from_frames_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoCaptioningFromFramesMapper(Mapper): - """Mapper to generate samples whose captions are generated based on - an image-to-text model and sampled video frames. Captions from different - frames will be concatenated into a single string.""" - - _accelerator = 'cuda' - _batched_op = True - -
[docs] def __init__( - self, - hf_img2seq: str = 'Salesforce/blip2-opt-2.7b', - trust_remote_code: bool = False, - caption_num: PositiveInt = 1, - keep_candidate_mode: str = 'random_any', - keep_original_sample: bool = True, - prompt: Optional[str] = None, - prompt_key: Optional[str] = None, - frame_sampling_method: str = 'all_keyframes', - frame_num: PositiveInt = 3, - horizontal_flip: bool = False, - vertical_flip: bool = False, - *args, - **kwargs, - ): - """ - Initialization method. - - :param hf_img2seq: model name on huggingface to generate caption - :param trust_remote_code: whether to trust the remote code of the - model from HuggingFace. - :param caption_num: how many candidate captions to generate - for each video - :param keep_candidate_mode: retention strategy for the generated - $caption_num$ candidates. - - 'random_any': Retain a random one from the generated captions - - 'similar_one_simhash': Retain the generated one that is most - similar to the original caption - - 'all': Retain all generated captions by concatenation - - Note: - This is a batched_OP, whose input and output types are - both list. Suppose there are $N$ lists of input samples, whose batch - size is $b$, and denote caption_num as $M$. - The total number of samples after generation is $2Nb$ when - keep_original_sample is True and $Nb$ when keep_original_sample is - False for 'random_any' and 'similar_one_simhash' modes, and it's - $(1+M)Nb$ for 'all' mode when keep_original_sample is True - and $MNb$ when keep_original_sample is False. - - :param keep_original_sample: whether to keep the original sample. If - it's set to False, there will be only generated captions in the - final datasets and the original captions will be removed. It's True - by default. - :param prompt: a string prompt to guide the generation of image-to-text - model for all samples globally. It's None by default, which means - no prompt is provided. - :param prompt_key: the key name of fields in samples to store prompts - for each sample. It's used to set different prompts for different - samples. If it's none, use prompt in parameter "prompt". It's None - by default. - :param frame_sampling_method: sampling method for extracting frames - from the videos. Should be one of - ["all_keyframes", "uniform"]. - The former extracts all key frames (the number - of which depends on the duration of the video) and the latter - extracts a specified number of frames uniformly from the video. - Default: "all_keyframes". - :param frame_num: the number of frames to be extracted uniformly from - the video. Only works when frame_sampling_method is "uniform". If - it's 1, only the middle frame will be extracted. If it's 2, only - the first and the last frames will be extracted. If it's larger - than 2, in addition to the first and the last frames, other frames - will be extracted uniformly within the video duration. - :param horizontal_flip: flip extracted frames horizontally (left to - right). - :param vertical_flip: flip extracted frames vertically (top to - bottom). - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - - if keep_candidate_mode not in [ - 'random_any', 'similar_one_simhash', 'all' - ]: - raise ValueError( - f'Keep strategy [{keep_candidate_mode}] is not supported. 
' - f'Can only be one of ' - f'["random_any", "similar_one_simhash", "all"].') - - if keep_candidate_mode in ['random_any', 'similar_one_simhash']: - self.num_newly_generated_samples = 1 - elif keep_candidate_mode in ['all']: - self.num_newly_generated_samples = caption_num - else: - self.num_newly_generated_samples = 0 - - # report a warning when both prompt and prompt_key are set - if prompt and prompt_key: - logger.warning( - 'Both the parameter `prompt` and `prompt_key` are ' - 'set. Data-Juicer will consider `prompt_key` first.') - - self.caption_num = caption_num - self.keep_candidate_mode = keep_candidate_mode - self.keep_original_sample = keep_original_sample - self.prompt = prompt - self.prompt_key = prompt_key - self.extra_args = kwargs - - if frame_sampling_method not in ['all_keyframes', 'uniform']: - raise ValueError( - f'Frame sampling method ' - f'[{frame_sampling_method}] is not supported. ' - f'Can only be one of ["all_keyframes", "uniform"].') - - self.horizontal_flip = horizontal_flip - self.vertical_flip = vertical_flip - self.frame_sampling_method = frame_sampling_method - self.frame_num = frame_num - - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=hf_img2seq, - trust_remote_code=trust_remote_code - )
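As a quick illustration of the sample-count arithmetic in the Note above, the hypothetical helper below (a standalone sketch, not part of Data-Juicer) computes the expected output size for each keep_candidate_mode:

# Hypothetical standalone sketch of the Note's sample-count math;
# n_lists, batch_size and caption_num mirror $N$, $b$ and $M$.
def expected_output_size(n_lists, batch_size, caption_num,
                         keep_candidate_mode, keep_original_sample):
    if keep_candidate_mode in ('random_any', 'similar_one_simhash'):
        new_per_sample = 1  # one reduced caption sample per original
    elif keep_candidate_mode == 'all':
        new_per_sample = caption_num  # $M$ new samples per original
    else:
        raise ValueError(f'unknown mode: {keep_candidate_mode}')
    total = new_per_sample * n_lists * batch_size
    if keep_original_sample:
        total += n_lists * batch_size  # the originals are kept as well
    return total

assert expected_output_size(1, 4, 3, 'random_any', True) == 8   # $2Nb$
assert expected_output_size(1, 4, 3, 'all', True) == 16         # $(1+M)Nb$
assert expected_output_size(1, 4, 3, 'all', False) == 12        # $MNb$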
- - def _process_single_sample(self, ori_sample, rank=None, context=False): - - # there is no videos in this sample - if self.video_key not in ori_sample or not ori_sample[self.video_key]: - return [] - - # the generated results - generated_samples = [ - copy.deepcopy(ori_sample) - for _ in range(self.num_newly_generated_samples) - ] - for generated_sample in generated_samples: - generated_sample[self.text_key] = '' - - # load videos - loaded_video_keys = ori_sample[self.video_key] - sample, videos = load_data_with_context(ori_sample, context, - loaded_video_keys, load_video) - - text = sample[self.text_key] - offset = 0 - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - for chunk in text.split(SpecialTokens.eoc): - - video_count = chunk.count(SpecialTokens.video) - - # no video or no text - if video_count == 0 or len(chunk.strip()) == 0: - continue - else: - text_with_only_special_tokens = remove_non_special_tokens( - chunk) - # generate candidate caption(s) in batch manner - generated_text_candidates_single_chunk = [ - [] for _ in range(self.caption_num) - ] - for video_key in loaded_video_keys[offset:offset + - video_count]: - video = videos[video_key] - video_frame_videos_chunk = [] - # extract frame videos - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly( - video, self.frame_num) - else: - frames = [] - frame_videos = [frame.to_image() for frame in frames] - for frame in frame_videos: - if self.horizontal_flip: - frame = ImageOps.mirror(frame) - if self.vertical_flip: - frame = ImageOps.flip(frame) - video_frame_videos_chunk.append(frame) - - # construct prompts - if self.prompt_key and isinstance( - ori_sample[self.prompt_key], str): - # check prompt_key is not None, and it's a str - # in the sample - prompt_texts = [ori_sample[self.prompt_key] - ] * len(video_frame_videos_chunk) - elif self.prompt and isinstance(self.prompt, str): - # check prompt is not None, and it's a str - prompt_texts = [self.prompt - ] * len(video_frame_videos_chunk) - else: - prompt_texts = None - - inputs = processor( - text=prompt_texts, - images=video_frame_videos_chunk, - return_tensors='pt', - ).to(model.device) - for i in range(self.caption_num): - generated_ids = model.generate(**inputs, - max_new_tokens=128, - do_sample=True) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True) - generated_text_candidates_single_chunk[i] += [ - '. '.join([txt.strip() for txt in generated_text]) - ] - - # 3. 
insert a list of generated captions into the positions of - # subsequent placeholders in the original string - new_generated_text_all_videos = [ - [] for _ in range(self.num_newly_generated_samples) - ] - # new_generated_text_all_videos is a helper array, - # element [i][j] - # denotes the reduced $i$-th result for the $j$-th video - - # reduce the captions according to given mode video by video - for j in range(video_count): - new_generated_text_per_video = self._reduce_captions( - chunk, - [ - captions[j] for captions in - generated_text_candidates_single_chunk - ], - ) - assert self.num_newly_generated_samples == len( - new_generated_text_per_video) - for i in range(len(new_generated_text_per_video)): - new_generated_text_all_videos[i].append( - new_generated_text_per_video[i]) - - # insert the captions according to given mode - place_holders = [SpecialTokens.video] * video_count - for i in range(self.num_newly_generated_samples): - generated_text_per_chunk = insert_texts_after_placeholders( - original_string=text_with_only_special_tokens, - placeholders=place_holders, - new_texts=new_generated_text_all_videos[i], - ) - generated_samples[i][ - self. - text_key] += f'{generated_text_per_chunk}' \ - f'{SpecialTokens.eoc}' - - offset += video_count - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - return generated_samples - - def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): - generated_text_per_chunk = [] - if self.keep_candidate_mode == 'random_any': - generated_text_per_chunk.append( - random.choice(generated_text_candidates_single_chunk)) - elif self.keep_candidate_mode == 'all': - generated_text_per_chunk.extend( - generated_text_candidates_single_chunk) - elif self.keep_candidate_mode == 'similar_one_simhash': - from ..deduplicator.document_simhash_deduplicator import \ - DocumentSimhashDeduplicator - - ori_normal_text = remove_special_tokens(chunk) - # using a simhash OP to calculate their similarity - # NOTE: simhash is just one method to calculate the similarities - # between texts, but not the most accurate one. More methods (e.g. - # embedding-based, ...) will be added. - op_simhash = DocumentSimhashDeduplicator(window_size=2, - **self.extra_args) - ori_text_hash = np.uint64( - op_simhash.compute_hash({op_simhash.text_key: - ori_normal_text})[HashKeys.simhash]) - generated_text_hashes = [ - np.uint64( - op_simhash.compute_hash( - {op_simhash.text_key: - candidate_text})[HashKeys.simhash]) - for candidate_text in generated_text_candidates_single_chunk - ] - hamming_distances = [ - simhash.num_differing_bits(ori_text_hash, generated_text_hash) - for generated_text_hash in generated_text_hashes - ] - max_index = min(range(len(hamming_distances)), - key=hamming_distances.__getitem__) - generated_text_per_chunk.append( - generated_text_candidates_single_chunk[max_index]) - return generated_text_per_chunk - -
[docs]    def process_batched(self, samples, rank=None, context=False):
-        """
-        :param samples: input batch of samples, organized as a dict of lists.
-        :return: the batch of samples after caption generation.
-
-        Note:
-            This is a batched_OP, whose input and output types are
-            both lists. Suppose there are $N$ input sample lists with
-            batch size $b$, and denote caption_num as $M$. When
-            keep_original_sample is True, the total number of samples
-            after generation is $2Nb$ for 'random_any' and
-            'similar_one_simhash' modes, and $(1+M)Nb$ for 'all' mode.
-        """
-        # reconstruct samples from "dict of lists" to "list of dicts"
-        reconstructed_samples = []
-        for i in range(len(samples[self.text_key])):
-            reconstructed_samples.append(
-                {key: samples[key][i]
-                 for key in samples})
-        samples_after_generation = []
-        # do generation for each sample within the batch
-        for ori_sample in reconstructed_samples:
-            if self.keep_original_sample:
-                samples_after_generation.append(ori_sample)
-            generated_samples = self._process_single_sample(ori_sample,
-                                                            rank=rank,
-                                                            context=context)
-            if len(generated_samples) != 0:
-                samples_after_generation.extend(generated_samples)
-        # reconstruct samples from "list of dicts" to "dict of lists"
-        keys = samples_after_generation[0].keys()
-        res_samples = {}
-        for key in keys:
-            res_samples[key] = [s[key] for s in samples_after_generation]
-
-        return res_samples
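The batched interface above hinges on a round trip between HuggingFace-style column batches ("dict of lists") and per-sample records ("list of dicts"). A minimal self-contained sketch of that conversion, with made-up field names:

# Round trip between a column batch and per-sample records, mirroring
# what process_batched does around the per-sample generation step.
batch = {'text': ['a', 'b'], 'videos': [['v1.mp4'], ['v2.mp4']]}

# dict of lists -> list of dicts
records = [{key: batch[key][i] for key in batch}
           for i in range(len(batch['text']))]

# ... per-sample processing happens here in the real op ...

# list of dicts -> dict of lists
rebuilt = {key: [rec[key] for rec in records] for key in records[0]}
assert rebuilt == batch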
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html deleted file mode 100644 index c6b8e60d8..000000000 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_summarizer_mapper.html +++ /dev/null @@ -1,367 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_captioning_from_summarizer_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_captioning_from_summarizer_mapper

-import copy
-from typing import Dict, Optional
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.lazy_loader import AUTOINSTALL
-from data_juicer.utils.mm_utils import SpecialTokens, remove_special_tokens
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-
-NAME = 'video_captioning_from_summarizer_mapper'
-
-
-
[docs]@OPERATORS.register_module(NAME) -class VideoCaptioningFromSummarizerMapper(Mapper): - """ - Mapper to generate video captions by summarizing several kinds of generated - texts (captions from video/audio/frames, tags from audio/frames, ...) - """ - - _accelerator = 'cuda' - _batched_op = True - -
[docs]    def __init__(self,
-                 hf_summarizer: Optional[str] = None,
-                 trust_remote_code: bool = False,
-                 consider_video_caption_from_video: bool = True,
-                 consider_video_caption_from_audio: bool = True,
-                 consider_video_caption_from_frames: bool = True,
-                 consider_video_tags_from_audio: bool = True,
-                 consider_video_tags_from_frames: bool = True,
-                 vid_cap_from_vid_args: Optional[Dict] = None,
-                 vid_cap_from_frm_args: Optional[Dict] = None,
-                 vid_tag_from_aud_args: Optional[Dict] = None,
-                 vid_tag_from_frm_args: Optional[Dict] = None,
-                 keep_tag_num: PositiveInt = 5,
-                 keep_original_sample: bool = True,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_summarizer: the summarizer model used to summarize texts
-            generated by other methods.
-        :param trust_remote_code: whether to trust the remote code of the
-            HF model
-        :param consider_video_caption_from_video: whether to consider the video
-            caption generated from video directly in the summarization process.
-            Default: True.
-        :param consider_video_caption_from_audio: whether to consider the video
-            caption generated from audio streams in the video in the
-            summarization process. Default: True.
-        :param consider_video_caption_from_frames: whether to consider the
-            video caption generated from sampled frames from the video in the
-            summarization process. Default: True.
-        :param consider_video_tags_from_audio: whether to consider the video
-            tags generated from audio streams in the video in the summarization
-            process. Default: True.
-        :param consider_video_tags_from_frames: whether to consider the video
-            tags generated from sampled frames from the video in the
-            summarization process. Default: True.
-        :param vid_cap_from_vid_args: the arg dict for video captioning from
-            video directly, whose keys are the arg names and whose values are
-            the arg values. Default: None.
-        :param vid_cap_from_frm_args: the arg dict for video captioning from
-            sampled frames from the video, whose keys are the arg names and
-            whose values are the arg values. Default: None.
-        :param vid_tag_from_aud_args: the arg dict for video tagging from audio
-            streams in the video, whose keys are the arg names and whose
-            values are the arg values. Default: None.
-        :param vid_tag_from_frm_args: the arg dict for video tagging from
-            sampled frames from the video, whose keys are the arg names and
-            whose values are the arg values. Default: None.
-        :param keep_tag_num: the maximum number N of tags from sampled frames
-            to keep. Too many tags might negatively influence the summarized
-            text, so we only keep the N most frequent tags. Default: 5.
-        :param keep_original_sample: whether to keep the original sample. If
-            it's set to False, there will be only summarized captions in the
-            final datasets and the original captions will be removed. It's
-            True by default.
- :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - AUTOINSTALL.check([ - 'torch', - 'transformers', - 'transformers_stream_generator', - 'einops', - 'accelerate', - 'tiktoken', # by audio caption - 'torchaudio', # by audio tag - ]) - - self.keep_original_sample = keep_original_sample - self.extra_args = kwargs - - # prepare summarizer - self._hf_summarizer = hf_summarizer if hf_summarizer else 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback' # noqa: E501 - self.model_key = prepare_model( - model_type='huggingface', - pretrained_model_name_or_path=self._hf_summarizer, - trust_remote_code=trust_remote_code) - - # prepare input texts ops - if vid_cap_from_vid_args is None: - vid_cap_from_vid_args = {} - if vid_cap_from_frm_args is None: - vid_cap_from_frm_args = {} - if vid_tag_from_aud_args is None: - vid_tag_from_aud_args = {} - if vid_tag_from_frm_args is None: - vid_tag_from_frm_args = {} - self.FIXED_ARGS = { - 'caption_num': 1, - 'keep_candidate_mode': 'random_any', - 'keep_original_sample': False, - } - self.cap_op_list = [] - self.tag_op_list = [] - if consider_video_caption_from_video: - from .video_captioning_from_video_mapper import \ - VideoCaptioningFromVideoMapper - self.cap_op_list.append( - VideoCaptioningFromVideoMapper(**self._prepare_op_args( - VideoCaptioningFromVideoMapper, vid_cap_from_vid_args))) - if consider_video_caption_from_audio: - from .video_captioning_from_audio_mapper import \ - VideoCaptioningFromAudioMapper - self.cap_op_list.append( - VideoCaptioningFromAudioMapper(**self._prepare_op_args( - VideoCaptioningFromAudioMapper, {}))) - if consider_video_caption_from_frames: - from .video_captioning_from_frames_mapper import \ - VideoCaptioningFromFramesMapper - self.cap_op_list.append( - VideoCaptioningFromFramesMapper(**self._prepare_op_args( - VideoCaptioningFromFramesMapper, vid_cap_from_frm_args))) - if consider_video_tags_from_audio: - from .video_tagging_from_audio_mapper import \ - VideoTaggingFromAudioMapper - self.tag_op_list.append( - VideoTaggingFromAudioMapper(**self._prepare_op_args( - VideoTaggingFromAudioMapper, vid_tag_from_aud_args))) - if consider_video_tags_from_frames: - from .video_tagging_from_frames_mapper import \ - VideoTaggingFromFramesMapper - self.tag_op_list.append( - VideoTaggingFromFramesMapper(**self._prepare_op_args( - VideoTaggingFromFramesMapper, vid_tag_from_frm_args))) - - self.keep_tag_num = keep_tag_num
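The keep_tag_num cap amounts to "keep only the N most frequent tags". Below is a minimal standalone sketch of that idea with collections.Counter; the tag values are made up, and note that the op itself simply truncates each frame's tag list to keep_tag_num entries in _process_single_sample below.

from collections import Counter

# Hypothetical frame tags; keep only the 5 most frequent ones.
frame_tags = ['music', 'speech', 'music', 'dog', 'music', 'speech',
              'car', 'rain', 'dog', 'wind']
keep_tag_num = 5
kept = [tag for tag, _ in Counter(frame_tags).most_common(keep_tag_num)]
print(kept)  # ['music', 'speech', 'dog', 'car', 'rain']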
-
-    def _prepare_op_args(self, op_class, args_dict):
-        required_args = set(op_class.__init__.__code__.co_varnames)
-        args_dict.update(self.FIXED_ARGS)
-        temp_args = copy.deepcopy(args_dict)
-        for key in temp_args:
-            if key not in required_args:
-                args_dict.pop(key)
-        args_dict['accelerator'] = self.accelerator
-        return args_dict
-
-    def _process_single_sample(self, sample, rank=None):
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            return []
-
-        # there are no activated ops
-        if len(self.cap_op_list) == 0 and len(self.tag_op_list) == 0:
-            return []
-
-        # get paths of all video(s)
-        loaded_video_keys = sample[self.video_key]
-
-        # get models
-        model, tokenizer = get_model(self.model_key, rank, self.use_cuda())
-
-        captioned_sample = copy.deepcopy(sample)
-        # generate for each video chunk by chunk
-        captioned_texts = ''
-        offset = 0
-        for chunk in sample[self.text_key].split(SpecialTokens.eoc):
-            # skip empty chunks
-            if not chunk.strip():
-                continue
-
-            vid_count = chunk.count(SpecialTokens.video)
-
-            if vid_count == 0:
-                # add special tokens
-                captioned_texts += f'{chunk}{SpecialTokens.eoc}'
-                continue
-
-            # make a temporary sample
-            temp_sample = {
-                self.text_key: chunk,
-                self.video_key: loaded_video_keys[offset:offset + vid_count],
-            }
-
-            captioned_text_list = []
-            # tag ops
-            for op in self.tag_op_list:
-                temp_sample = op.process(temp_sample, rank=rank)
-                if Fields.video_audio_tags in temp_sample:
-                    captioned_text_list.extend(
-                        temp_sample[Fields.video_audio_tags])
-                if Fields.video_frame_tags in temp_sample:
-                    for tag_list in temp_sample[Fields.video_frame_tags]:
-                        # keep only the first keep_tag_num tags per frame
-                        captioned_text_list.extend(
-                            tag_list[:self.keep_tag_num])
-            # cap ops
-            for op in self.cap_op_list:
-                captioned_text_list.append(
-                    remove_special_tokens(
-                        op._process_single_sample(temp_sample,
-                                                  rank=rank)[0]['text']))
-
-            # summarization
-            all_texts = ', '.join(captioned_text_list)
-            input_ids = tokenizer(all_texts, return_tensors='pt').input_ids.to(
-                model.device)
-            outputs = model.generate(input_ids, max_new_tokens=128)
-            summarized_text = tokenizer.decode(outputs[0],
-                                               skip_special_tokens=True)
-
-            offset += vid_count
-            captioned_text = f'{SpecialTokens.video * vid_count} ' \
-                             f'{summarized_text}'
-
-            # add special tokens
-            captioned_texts += f'{captioned_text}{SpecialTokens.eoc}'
-
-        captioned_sample[self.text_key] = captioned_texts
-        return [captioned_sample]
-
[docs]    def process_batched(self, samples, rank=None):
-        # reconstruct samples from "dict of lists" to "list of dicts"
-        reconstructed_samples = []
-        for i in range(len(samples[self.text_key])):
-            reconstructed_samples.append(
-                {key: samples[key][i]
-                 for key in samples})
-        samples_after_generation = []
-        # do summarization for each sample within the batch
-        for ori_sample in reconstructed_samples:
-            if self.keep_original_sample:
-                samples_after_generation.append(ori_sample)
-            generated_samples = self._process_single_sample(ori_sample,
-                                                            rank=rank)
-            if len(generated_samples) != 0:
-                samples_after_generation.extend(generated_samples)
-        # reconstruct samples from "list of dicts" to "dict of lists"
-        keys = samples_after_generation[0].keys()
-        res_samples = {}
-        for key in keys:
-            res_samples[key] = [s[key] for s in samples_after_generation]
-
-        return res_samples
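Stripped of the op machinery, the summarization call in _process_single_sample is a standard seq2seq generate. A rough standalone sketch using only the transformers API; the input text is a placeholder and the model name is the op's default:

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = 'mrm8488/flan-t5-large-finetuned-openai-summarize_from_feedback'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Comma-joined captions/tags, as assembled in _process_single_sample above.
all_texts = 'a man rides a bike, street noise, cycling, city, traffic'
input_ids = tokenizer(all_texts, return_tensors='pt').input_ids
outputs = model.generate(input_ids, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))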
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html b/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html deleted file mode 100644 index 855c9305e..000000000 --- a/_modules/data_juicer/ops/mapper/video_captioning_from_video_mapper.html +++ /dev/null @@ -1,478 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_captioning_from_video_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_captioning_from_video_mapper

-# yapf: disable
-import copy
-import random
-from typing import Optional
-
-import numpy as np
-from loguru import logger
-from PIL import ImageOps
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import HashKeys
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
-                                        extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        insert_texts_after_placeholders,
-                                        load_data_with_context, load_video,
-                                        remove_non_special_tokens,
-                                        remove_special_tokens)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-simhash = LazyLoader('simhash', 'simhash')
-
-OP_NAME = 'video_captioning_from_video_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoCaptioningFromVideoMapper(Mapper): - """Mapper to generate samples whose captions are generated based on - a video-to-text model and sampled video frame.""" - - _accelerator = 'cuda' - _batched_op = True - -
[docs]    def __init__(
-        self,
-        hf_video_blip: str = 'kpyu/video-blip-opt-2.7b-ego4d',
-        trust_remote_code: bool = False,
-        caption_num: PositiveInt = 1,
-        keep_candidate_mode: str = 'random_any',
-        keep_original_sample: bool = True,
-        prompt: Optional[str] = None,
-        prompt_key: Optional[str] = None,
-        frame_sampling_method: str = 'all_keyframes',
-        frame_num: PositiveInt = 3,
-        horizontal_flip: bool = False,
-        vertical_flip: bool = False,
-        *args,
-        **kwargs,
-    ):
-        """
-        Initialization method.
-
-        :param hf_video_blip: video-blip model name on huggingface
-            to generate captions
-        :param trust_remote_code: whether to trust the remote code of the
-            HF model
-        :param caption_num: how many candidate captions to generate
-            for each video
-        :param keep_candidate_mode: retain strategy for the generated
-            $caption_num$ candidates.
-
-            'random_any': Retain a random one from the generated captions
-
-            'similar_one_simhash': Retain the generated one that is most
-            similar to the original caption
-
-            'all': Retain all generated captions by concatenation
-
-        Note:
-            This is a batched_OP, whose input and output types are
-            both lists. Suppose there are $N$ lists of input samples, each
-            with batch size $b$, and denote caption_num as $M$.
-            For 'random_any' and 'similar_one_simhash' modes, the total
-            number of samples after generation is $2Nb$ when
-            keep_original_sample is True and $Nb$ when it is False.
-            For 'all' mode, it's $(1+M)Nb$ when keep_original_sample is
-            True and $MNb$ when it is False.
-
-        :param keep_original_sample: whether to keep the original sample. If
-            it's set to False, there will be only generated captions in the
-            final datasets and the original captions will be removed. It's
-            True by default.
-        :param prompt: a string prompt to guide the generation of the
-            video-blip model for all samples globally. It's None by
-            default, which means no prompt is provided.
-        :param prompt_key: the key name of fields in samples to store prompts
-            for each sample. It's used to set different prompts for different
-            samples. If it's None, the prompt in the parameter "prompt" is
-            used. It's None by default.
-        :param frame_sampling_method: sampling method for extracting frames
-            from the videos. Should be one of
-            ["all_keyframes", "uniform"].
-            The former extracts all key frames (the number of which depends
-            on the duration of the video) and the latter extracts a specified
-            number of frames uniformly from the video.
-            Default: "all_keyframes".
-        :param frame_num: the number of frames to be extracted uniformly from
-            the video. Only works when frame_sampling_method is "uniform". If
-            it's 1, only the middle frame will be extracted. If it's 2, only
-            the first and the last frames will be extracted. If it's larger
-            than 2, in addition to the first and the last frames, other frames
-            will be extracted uniformly within the video duration.
-        :param horizontal_flip: flip the extracted frames horizontally (left
-            to right).
-        :param vertical_flip: flip the extracted frames vertically (top to
-            bottom).
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-
-        if keep_candidate_mode not in [
-                'random_any', 'similar_one_simhash', 'all'
-        ]:
-            raise ValueError(
-                f'Keep strategy [{keep_candidate_mode}] is not supported. '
-                f'Can only be one of '
-                f'["random_any", "similar_one_simhash", "all"].')
-
-        if keep_candidate_mode in ['random_any', 'similar_one_simhash']:
-            self.num_newly_generated_samples = 1
-        elif keep_candidate_mode in ['all']:
-            self.num_newly_generated_samples = caption_num
-        else:
-            self.num_newly_generated_samples = 0
-
-        # report a warning when both prompt and prompt_key are set
-        if prompt and prompt_key:
-            logger.warning(
-                'Both the parameter `prompt` and `prompt_key` are '
-                'set. Data-Juicer will consider `prompt_key` first.')
-
-        self.caption_num = caption_num
-        self.keep_candidate_mode = keep_candidate_mode
-        self.keep_original_sample = keep_original_sample
-        self.prompt = prompt
-        self.prompt_key = prompt_key
-        self.extra_args = kwargs
-
-        if frame_sampling_method not in ['all_keyframes', 'uniform']:
-            raise ValueError(
-                f'Frame sampling method '
-                f'[{frame_sampling_method}] is not supported. '
-                f'Can only be one of ["all_keyframes", "uniform"].')
-
-        self.horizontal_flip = horizontal_flip
-        self.vertical_flip = vertical_flip
-        self.frame_sampling_method = frame_sampling_method
-        self.frame_num = frame_num
-
-        self.model_key = prepare_model(
-            model_type='video_blip',
-            pretrained_model_name_or_path=hf_video_blip,
-            trust_remote_code=trust_remote_code
-        )
- - def _process_single_sample(self, ori_sample, rank=None, context=False): - - # there is no videos in this sample - if self.video_key not in ori_sample or not ori_sample[self.video_key]: - return [] - - # the generated results - generated_samples = [ - copy.deepcopy(ori_sample) - for _ in range(self.num_newly_generated_samples) - ] - for generated_sample in generated_samples: - generated_sample[self.text_key] = '' - - # load videos - loaded_video_keys = ori_sample[self.video_key] - sample, videos = load_data_with_context(ori_sample, context, - loaded_video_keys, load_video) - - text = sample[self.text_key] - offset = 0 - model, processor = get_model(self.model_key, rank, self.use_cuda()) - - for chunk in text.split(SpecialTokens.eoc): - - video_count = chunk.count(SpecialTokens.video) - - # no video or no text - if video_count == 0 or len(chunk) == 0: - continue - else: - text_with_only_special_tokens = remove_non_special_tokens( - chunk) - # generate candidate caption(s) in batch manner - generated_text_candidates_single_chunk = [ - [] for _ in range(self.caption_num) - ] - for video_key in loaded_video_keys[offset:offset + - video_count]: - video = videos[video_key] - video_frame_videos_chunk = [] - # extract frame videos - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly( - video, self.frame_num) - else: - frames = [] - frame_videos = [frame.to_image() for frame in frames] - for video in frame_videos: - if self.horizontal_flip: - video = ImageOps.mirror(video) - if self.vertical_flip: - video = ImageOps.flip(video) - video_frame_videos_chunk.append(video) - - # construct prompts - if self.prompt_key and isinstance( - ori_sample[self.prompt_key], str): - # check prompt_key is not None, and it's a str - # in the sample - prompt_texts = [ori_sample[self.prompt_key]] - elif self.prompt and isinstance(self.prompt, str): - # check prompt is not None, and it's a str - prompt_texts = [self.prompt] - else: - prompt_texts = None - inputs = processor( - text=prompt_texts, - images=video_frame_videos_chunk, - return_tensors='pt', - truncation=True, - max_length=model.config.text_config. - max_position_embeddings, - padding=True, - ).to(model.device) - # tchw to bcthw - inputs['pixel_values'] = inputs.pixel_values.unsqueeze( - 0).permute(0, 2, 1, 3, 4) - for i in range(self.caption_num): - generated_ids = model.generate(**inputs, - num_beams=4, - max_new_tokens=128, - temperature=0.7, - top_p=0.9, - repetition_penalty=1.5, - do_sample=True) - generated_text = processor.batch_decode( - generated_ids, skip_special_tokens=True) - generated_text_candidates_single_chunk[ - i] += generated_text - - # 3. 
insert a list of generated captions into the positions of - # subsequent placeholders in the original string - new_generated_text_all_videos = [ - [] for _ in range(self.num_newly_generated_samples) - ] - # new_generated_text_all_videos is a helper array, - # element [i][j] - # denotes the reduced $i$-th result for the $j$-th video - - # reduce the captions according to given mode video by video - for j in range(video_count): - new_generated_text_per_video = self._reduce_captions( - chunk, - [ - captions[j] for captions in - generated_text_candidates_single_chunk - ], - ) - assert self.num_newly_generated_samples == len( - new_generated_text_per_video) - for i in range(len(new_generated_text_per_video)): - new_generated_text_all_videos[i].append( - new_generated_text_per_video[i]) - - # insert the captions according to given mode - place_holders = [SpecialTokens.video] * video_count - for i in range(self.num_newly_generated_samples): - generated_text_per_chunk = insert_texts_after_placeholders( - original_string=text_with_only_special_tokens, - placeholders=place_holders, - new_texts=new_generated_text_all_videos[i], - ) - generated_samples[i][ - self. - text_key] += f'{generated_text_per_chunk}' \ - f'{SpecialTokens.eoc}' - - offset += video_count - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - return generated_samples - - def _reduce_captions(self, chunk, generated_text_candidates_single_chunk): - generated_text_per_chunk = [] - if self.keep_candidate_mode == 'random_any': - generated_text_per_chunk.append( - random.choice(generated_text_candidates_single_chunk)) - elif self.keep_candidate_mode == 'all': - generated_text_per_chunk.extend( - generated_text_candidates_single_chunk) - elif self.keep_candidate_mode == 'similar_one_simhash': - from ..deduplicator.document_simhash_deduplicator import \ - DocumentSimhashDeduplicator - - ori_normal_text = remove_special_tokens(chunk) - # using a simhash OP to calculate their similarity - # NOTE: simhash is just one method to calculate the similarities - # between texts, but not the most accurate one. More methods (e.g. - # embedding-based, ...) will be added. - op_simhash = DocumentSimhashDeduplicator(window_size=2, - **self.extra_args) - ori_text_hash = np.uint64( - op_simhash.compute_hash({op_simhash.text_key: - ori_normal_text})[HashKeys.simhash]) - generated_text_hashes = [ - np.uint64( - op_simhash.compute_hash( - {op_simhash.text_key: - candidate_text})[HashKeys.simhash]) - for candidate_text in generated_text_candidates_single_chunk - ] - hamming_distances = [ - simhash.num_differing_bits(ori_text_hash, generated_text_hash) - for generated_text_hash in generated_text_hashes - ] - max_index = min(range(len(hamming_distances)), - key=hamming_distances.__getitem__) - generated_text_per_chunk.append( - generated_text_candidates_single_chunk[max_index]) - return generated_text_per_chunk - -
[docs]    def process_batched(self, samples, rank=None, context=False):
-        """
-        :param samples: input batch of samples, organized as a dict of lists.
-        :return: the batch of samples after caption generation.
-
-        Note:
-            This is a batched_OP, whose input and output types are
-            both lists. Suppose there are $N$ input sample lists with
-            batch size $b$, and denote caption_num as $M$. When
-            keep_original_sample is True, the total number of samples
-            after generation is $2Nb$ for 'random_any' and
-            'similar_one_simhash' modes, and $(1+M)Nb$ for 'all' mode.
-        """
-        # reconstruct samples from "dict of lists" to "list of dicts"
-        reconstructed_samples = []
-        for i in range(len(samples[self.text_key])):
-            reconstructed_samples.append(
-                {key: samples[key][i]
-                 for key in samples})
-        samples_after_generation = []
-        # do generation for each sample within the batch
-        for ori_sample in reconstructed_samples:
-            if self.keep_original_sample:
-                samples_after_generation.append(ori_sample)
-            generated_samples = self._process_single_sample(ori_sample,
-                                                            rank=rank,
-                                                            context=context)
-            if len(generated_samples) != 0:
-                samples_after_generation.extend(generated_samples)
-        # reconstruct samples from "list of dicts" to "dict of lists"
-        keys = samples_after_generation[0].keys()
-        res_samples = {}
-        for key in keys:
-            res_samples[key] = [s[key] for s in samples_after_generation]
-
-        return res_samples
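For reference, the 'similar_one_simhash' reduction in _reduce_captions just keeps the candidate whose 64-bit simhash has the smallest Hamming distance to the original text's hash. With plain integers the selection reduces to the following (illustrative hash values; int.bit_count needs Python 3.10+):

# Pick the candidate hash with the fewest differing bits vs. the original.
ori_hash = 0b1011_0010_1110_0001
candidate_hashes = [0b1011_0010_1110_0011,   # Hamming distance 1
                    0b0011_1010_0110_0001,   # Hamming distance 3
                    0b1111_0000_1110_0001]   # Hamming distance 2
distances = [(ori_hash ^ h).bit_count() for h in candidate_hashes]
best = min(range(len(distances)), key=distances.__getitem__)
assert best == 0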
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html b/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html deleted file mode 100644 index 78f4a322d..000000000 --- a/_modules/data_juicer/ops/mapper/video_face_blur_mapper.html +++ /dev/null @@ -1,244 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_face_blur_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_face_blur_mapper

-import os
-
-import av
-from PIL import ImageFilter
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video, detect_faces,
-                                        load_data_with_context, load_video,
-                                        process_each_frame)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'video_face_blur_mapper'
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME) -@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoFaceBlurMapper(Mapper): - """Mapper to blur faces detected in videos. - """ - - _default_kwargs = { - 'scaleFactor': 1.1, - 'minNeighbors': 3, - 'minSize': None, - 'maxSize': None, - } - -
[docs] def __init__(self, - cv_classifier: str = '', - blur_type: str = 'gaussian', - radius: float = 2, - *args, - **kwargs): - """ - Initialization method. - - :param cv_classifier: OpenCV classifier path for face detection. - By default, we will use 'haarcascade_frontalface_alt.xml'. - :param blur_type: Type of blur kernel, including - ['mean', 'box', 'gaussian']. - :param radius: Radius of blur kernel. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - - if cv_classifier == '': - cv_classifier = os.path.join(cv2.data.haarcascades, - 'haarcascade_frontalface_alt.xml') - if blur_type not in ['mean', 'box', 'gaussian']: - raise ValueError( - f'Blur_type [{blur_type}] is not supported. ' - f'Can only be one of ["mean", "box", "gaussian"]. ') - if radius < 0: - raise ValueError('Radius must be >= 0. ') - - if blur_type == 'mean': - self.blur = ImageFilter.BLUR - elif blur_type == 'box': - self.blur = ImageFilter.BoxBlur(radius) - else: - self.blur = ImageFilter.GaussianBlur(radius) - - self.blur_type = blur_type - self.radius = radius - - self.extra_kwargs = self._default_kwargs - for key in kwargs: - if key in self.extra_kwargs: - self.extra_kwargs[key] = kwargs[key] - - self.model_key = prepare_model(model_type='opencv_classifier', - model_path=cv_classifier)
- -
[docs] def process_single(self, sample, context=False): - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - model = get_model(self.model_key) - - def _blur_func(frame): - image = frame.to_image() - dets = detect_faces(image, model, **self.extra_kwargs) - if len(dets) > 0: - for (x, y, w, h) in dets: - box = (x, y, x + w, y + h) - blured_roi = image.crop(box).filter(self.blur) - image.paste(blured_roi, box) - frame = av.VideoFrame.from_image(image) - return frame - - processed_video_keys = {} - for video_key in loaded_video_keys: - # skip duplicate - if video_key in processed_video_keys: - continue - - video = videos[video_key] - blured_video_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - output_video_key = process_each_frame(video, blured_video_key, - _blur_func) - processed_video_keys[video_key] = output_video_key - - if not context: - close_video(video) - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(loaded_video_keys): - if sample[Fields.source_file][i] != value: - if processed_video_keys[value] != value: - sample[Fields.source_file][i] = value - - sample[self.video_key] = [ - processed_video_keys[key] for key in loaded_video_keys - ] - return sample
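On a single image, the blur step above reduces to Haar-cascade detection plus a PIL filter pasted back over each detected box. A rough standalone sketch with OpenCV and Pillow; the file names are placeholders:

import os

import cv2
import numpy as np
from PIL import Image, ImageFilter

image = Image.open('photo.jpg').convert('RGB')  # placeholder input
cascade = cv2.CascadeClassifier(
    os.path.join(cv2.data.haarcascades, 'haarcascade_frontalface_alt.xml'))
gray = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2GRAY)
for (x, y, w, h) in cascade.detectMultiScale(gray, scaleFactor=1.1,
                                             minNeighbors=3):
    box = (int(x), int(y), int(x + w), int(y + h))
    # blur only the face region and paste it back in place
    image.paste(image.crop(box).filter(ImageFilter.GaussianBlur(2)), box)
image.save('photo_blurred.jpg')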
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html b/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html deleted file mode 100644 index 8b96b01e7..000000000 --- a/_modules/data_juicer/ops/mapper/video_ffmpeg_wrapped_mapper.html +++ /dev/null @@ -1,195 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_ffmpeg_wrapped_mapper

-from typing import Dict, List, Optional
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-
-from ..base_op import OPERATORS, Mapper
-
-with HiddenPrints():
-    ffmpeg = LazyLoader('ffmpeg', 'ffmpeg')
-
-OP_NAME = 'video_ffmpeg_wrapped_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class VideoFFmpegWrappedMapper(Mapper): - """Simple wrapper for FFmpeg video filters. - """ - -
[docs] def __init__( - self, - filter_name: Optional[str] = None, - filter_kwargs: Optional[Dict] = None, - global_args: Optional[List[str]] = None, - capture_stderr: bool = True, - overwrite_output: bool = True, - *args, - **kwargs, - ): - """ - Initialization method. - - :param filter_name: ffmpeg video filter name. - :param filter_kwargs: keyword-arguments passed to ffmpeg filter. - :param global_args: list-arguments passed to ffmpeg command-line. - :param capture_stderr: whether to capture stderr. - :param overwrite_output: whether to overwrite output file. - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs) - self._init_parameters = self.remove_extra_parameters(locals()) - - self.filter_name = filter_name - self.filter_kwargs = filter_kwargs - self.global_args = global_args - self.capture_stderr = capture_stderr - self.overwrite_output = overwrite_output
- -
[docs] def process_single(self, sample): - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - if self.filter_name is None: - return sample - - loaded_video_keys = sample[self.video_key] - processed = {} - for video_key in loaded_video_keys: - if video_key in processed: - continue - - output_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - stream = (ffmpeg.input(video_key).filter( - self.filter_name, **self.filter_kwargs).output(output_key)) - if self.global_args is not None: - stream = stream.global_args(*self.global_args) - stream.run(capture_stderr=self.capture_stderr, - overwrite_output=self.overwrite_output) - processed[video_key] = output_key - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(loaded_video_keys): - if sample[Fields.source_file][i] != value: - if processed[value] != value: - sample[Fields.source_file][i] = value - - sample[self.video_key] = [processed[key] for key in loaded_video_keys] - return sample
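For reference, the one-filter pipeline this op assembles can be written directly with ffmpeg-python. A minimal sketch equivalent to filter_name='scale' with filter_kwargs={'width': 640, 'height': -2}; the paths are placeholders:

import ffmpeg  # the ffmpeg-python package

(
    ffmpeg
    .input('input.mp4')                      # placeholder input path
    .filter('scale', width=640, height=-2)   # -2 keeps aspect, even height
    .output('output.mp4')                    # placeholder output path
    .run(capture_stderr=True, overwrite_output=True)
)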
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html b/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html deleted file mode 100644 index 393218641..000000000 --- a/_modules/data_juicer/ops/mapper/video_remove_watermark_mapper.html +++ /dev/null @@ -1,355 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_remove_watermark_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_remove_watermark_mapper

-import os
-from typing import List, Optional
-
-import av
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import (close_video,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video,
-                                        parse_string_to_roi,
-                                        process_each_frame)
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-with HiddenPrints():
-    cv2 = LazyLoader('cv2', 'cv2')
-
-OP_NAME = 'video_remove_watermark_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -@LOADED_VIDEOS.register_module(OP_NAME) -class VideoRemoveWatermarkMapper(Mapper): - """ - Remove the watermarks in videos given regions. - """ - -
[docs]    def __init__(self,
-                 roi_strings: List[str] = ['0,0,0.1,0.1'],
-                 roi_type: str = 'ratio',
-                 roi_key: Optional[str] = None,
-                 frame_num: PositiveInt = 10,
-                 min_frame_threshold: PositiveInt = 7,
-                 detection_method: str = 'pixel_value',
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param roi_strings: a given list of regions where the watermarks are
-            located. The format of each region can be "x1, y1, x2, y2",
-            "(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".
-        :param roi_type: the roi string type. When the type is 'pixel', (x1,
-            y1) and (x2, y2) are the pixel locations of the top left and
-            bottom right corners respectively. If the roi_type is 'ratio',
-            the coordinates are normalized by the video width and height.
-        :param roi_key: the key name of fields in samples to store
-            roi_strings for each sample. It's used to set different rois for
-            different samples. If it's None, the rois in the parameter
-            "roi_strings" are used. It's None by default.
-        :param frame_num: the number of frames to be extracted uniformly from
-            the video to detect the pixels of the watermark.
-        :param min_frame_threshold: a coordinate is considered to be the
-            location of a watermark pixel when it is detected as one in at
-            least min_frame_threshold frames.
-        :param detection_method: the method to detect the pixels of the
-            watermark. If it is 'pixel_value', we consider the distribution
-            of pixel values in each frame. If it is 'pixel_diversity', we
-            consider the pixel diversity across different frames. The
-            min_frame_threshold is ignored and frame_num must be greater
-            than 1 in 'pixel_diversity' mode.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        if roi_type not in ['ratio', 'pixel']:
-            raise ValueError(f'roi_type [{roi_type}]'
-                             f' is not supported. '
-                             f"Can only be one of ['ratio', 'pixel']. ")
-
-        if detection_method not in ['pixel_value', 'pixel_diversity']:
-            raise ValueError(
-                f'detection_method [{detection_method}]'
-                f' is not supported. '
-                f"Can only be one of ['pixel_value', 'pixel_diversity']. ")
-
-        if detection_method == 'pixel_diversity' and frame_num < 2:
-            raise ValueError(
-                "frame_num must be greater than 1 in 'pixel_diversity' mode.")
-
-        rois = []
-        if roi_key is None:
-            for roi_string in roi_strings:
-                roi = parse_string_to_roi(roi_string, roi_type)
-                if roi is None:
-                    raise ValueError(
-                        'The roi in roi_strings must be four non-negative'
-                        ' numbers in the format of "x1, y1, x2, y2", '
-                        '"(x1, y1, x2, y2)", or "[x1, y1, x2, y2]".')
-                rois.append(roi)
-
-        self.roi_type = roi_type
-        self.rois = rois
-        self.roi_key = roi_key
-        self.frame_num = frame_num
-        self.min_frame_threshold = min_frame_threshold
-        self.detection_method = detection_method
- - def _detect_watermark_via_pixel_value(self, frames, rois): - - masks = [] - for frame in frames: - frame = frame.to_ndarray(format='bgr24') - mask = np.zeros_like(frame[:, :, 0], dtype=np.uint8) - for roi in rois: - # dimension of ndarray frame: height x width x channel - roi_frame = frame[roi[1]:roi[3], roi[0]:roi[2]] - gray_frame = cv2.cvtColor(roi_frame, cv2.COLOR_BGR2GRAY) - _, binary_frame = cv2.threshold( - gray_frame, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - - # assume the watermark is located in the box, so the pixel in - # the edge must be 0, if not, reverse binary_frame - edge_postive_num = (binary_frame[0] > - 0).sum() + (binary_frame[:, 0] > 0).sum() - total = binary_frame.shape[0] + binary_frame.shape[1] - if edge_postive_num * 2 > total: - binary_frame = ~binary_frame - - mask[roi[1]:roi[3], - roi[0]:roi[2]] = mask[roi[1]:roi[3], - roi[0]:roi[2]] | binary_frame - masks.append(mask) - final_mask = sum((mask == 255).astype(np.uint8) for mask in masks) - final_mask = np.where(final_mask >= self.min_frame_threshold, 255, 0) - final_mask = final_mask.astype(np.uint8) - return final_mask - - def _detect_watermark_via_pixel_diversity(self, frames, rois): - - mask = np.zeros((frames[0].height, frames[0].width), dtype=np.uint8) - frames = [frame.to_ndarray(format='bgr24') for frame in frames] - - for roi in rois: - roi_frames = [ - frame[roi[1]:roi[3], roi[0]:roi[2]] for frame in frames - ] - roi_frames = np.stack(roi_frames, axis=0) - pixel_diversity = roi_frames.std(axis=0) - pixel_diversity = pixel_diversity.sum(-1) - max_diversity = np.max(pixel_diversity) - min_diversity = np.min(pixel_diversity) - if max_diversity > min_diversity: - scaled_diversity = 255 * (pixel_diversity - min_diversity) / ( - max_diversity - min_diversity) - else: - scaled_diversity = np.zeros_like(pixel_diversity) - scaled_diversity = scaled_diversity.astype(np.uint8) - _, binary_frame = cv2.threshold( - scaled_diversity, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU) - # the watermark pixels have less diversity - binary_frame = ~binary_frame - mask[roi[1]:roi[3], - roi[0]:roi[2]] = mask[roi[1]:roi[3], - roi[0]:roi[2]] | binary_frame - - return mask - - def _generate_watermark_mask(self, video, sample): - frames = extract_video_frames_uniformly(video, self.frame_num) - - if self.roi_key is not None: - roi_strings = sample[self.roi_key] - if isinstance(roi_strings, str): - roi_strings = [roi_strings] - rois = [ - parse_string_to_roi(roi_string, self.roi_type) - for roi_string in roi_strings - ] - rois = [roi for roi in rois if roi is not None] - else: - rois = self.rois - if self.roi_type == 'ratio': - rois = [ - tuple([ - int(roi[0] * frames[0].width), - int(roi[1] * frames[0].height), - int(roi[2] * frames[0].width), - int(roi[3] * frames[0].height) - ]) for roi in self.rois - ] - - if self.detection_method == 'pixel_value': - mask = self._detect_watermark_via_pixel_value(frames, rois) - else: - mask = self._detect_watermark_via_pixel_diversity(frames, rois) - - kernel = np.ones((5, 5), np.uint8) - return cv2.dilate(mask, kernel) - - def _clean_watermark(self, frame, watermark_mask): - np_frame = frame.to_ndarray(format='bgr24') - new_np_frame = cv2.inpaint(np_frame, watermark_mask, 3, cv2.INPAINT_NS) - return av.VideoFrame.from_ndarray(new_np_frame, format='bgr24') - -
[docs] def process_single(self, sample, context=False): - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - for index, video_key in enumerate(loaded_video_keys): - video = videos[video_key] - cleaned_video_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - - if (not os.path.exists(cleaned_video_key) - or cleaned_video_key not in loaded_video_keys): - watermark_mask = self._generate_watermark_mask(video, sample) - - def process_frame_func(frame): - return self._clean_watermark(frame, watermark_mask) - - cleaned_video_key = process_each_frame(video, - cleaned_video_key, - process_frame_func) - - loaded_video_keys[index] = cleaned_video_key - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(sample[self.video_key]): - if sample[Fields.source_file][i] != value: - if loaded_video_keys[i] != value: - sample[Fields.source_file][i] = value - - sample[self.video_key] = loaded_video_keys - return sample
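The repair itself is delegated to OpenCV's Navier-Stokes inpainting: given a frame and a binary watermark mask, one call fills the masked region. A standalone sketch with a synthetic corner mask (the paths and mask region are made up):

import cv2
import numpy as np

frame = cv2.imread('frame.png')          # placeholder input, BGR
mask = np.zeros(frame.shape[:2], np.uint8)
mask[0:40, 0:120] = 255                  # assume a watermark in the corner

# Dilate slightly (as the op does), then inpaint the masked region.
mask = cv2.dilate(mask, np.ones((5, 5), np.uint8))
clean = cv2.inpaint(frame, mask, 3, cv2.INPAINT_NS)
cv2.imwrite('frame_clean.png', clean)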
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html deleted file mode 100644 index 69937ba59..000000000 --- a/_modules/data_juicer/ops/mapper/video_resize_aspect_ratio_mapper.html +++ /dev/null @@ -1,263 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_resize_aspect_ratio_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_resize_aspect_ratio_mapper

-import math
-import os
-from fractions import Fraction
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import close_video, load_video
-
-from ..base_op import OPERATORS, Mapper
-
-with HiddenPrints():
-    ffmpeg = LazyLoader('ffmpeg', 'ffmpeg')
-
-OP_NAME = 'video_resize_aspect_ratio_mapper'
-
-
-def rescale(width, height, ori_ratio, min_ratio, max_ratio, strategy):
-
-    scaled_width = width
-    scaled_height = height
-    ori_ratio = Fraction(ori_ratio)
-    min_ratio = Fraction(min_ratio)
-    max_ratio = Fraction(max_ratio)
-    if ori_ratio < min_ratio:
-        if strategy == 'increase':
-            # increase width to meet the min ratio
-            scaled_width = math.ceil(height * min_ratio)
-            scaled_width += scaled_width % 2
-        elif strategy == 'decrease':
-            # decrease height to meet the min ratio
-            scaled_height = math.floor(width / min_ratio)
-            scaled_height -= scaled_height % 2
-
-    elif ori_ratio > max_ratio:
-        if strategy == 'increase':
-            # increase height to meet the max ratio
-            scaled_height = math.ceil(width / max_ratio)
-            scaled_height += scaled_height % 2
-
-        elif strategy == 'decrease':
-            # decrease width to meet the max ratio
-            scaled_width = math.floor(height * max_ratio)
-            scaled_width -= scaled_width % 2
-
-    assert Fraction(scaled_width, scaled_height) >= min_ratio
-    assert Fraction(scaled_width, scaled_height) <= max_ratio
-
-    scaled_width = max(2, scaled_width)
-    scaled_height = max(2, scaled_height)
-
-    return scaled_width, scaled_height
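A quick worked example of the rescale function above: a 1000x400 video has aspect ratio 5/2, which exceeds a 21/9 max_ratio. With strategy 'increase', the height grows to ceil(1000 * 9 / 21) = 429 and is then bumped to the even 430:

from fractions import Fraction

# rescale as defined above; 1000x400 (ratio 5/2) violates the 21/9 max.
w, h = rescale(1000, 400, Fraction(1000, 400),
               Fraction(9, 21), Fraction(21, 9), 'increase')
assert (w, h) == (1000, 430)
assert Fraction(9, 21) <= Fraction(w, h) <= Fraction(21, 9)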
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class VideoResizeAspectRatioMapper(Mapper): - """Mapper to resize videos by aspect ratio. - AspectRatio = W / H. - """ - - STRATEGY = ['decrease', 'increase'] - -
[docs]    def __init__(
-        self,
-        min_ratio: str = '9/21',
-        max_ratio: str = '21/9',
-        strategy: str = 'increase',
-        *args,
-        **kwargs,
-    ):
-        """
-        Initialization method.
-
-        :param min_ratio: The minimum aspect ratio to enforce; videos with
-            an aspect ratio below `min_ratio` will be resized to match
-            this minimum ratio. The ratio should be provided as a string
-            in the format "9:21" or "9/21".
-        :param max_ratio: The maximum aspect ratio to enforce; videos with
-            an aspect ratio above `max_ratio` will be resized to match
-            this maximum ratio. The ratio should be provided as a string
-            in the format "21:9" or "21/9".
-        :param strategy: The resizing strategy to apply when adjusting the
-            video dimensions. It can be either 'decrease' to reduce the
-            dimension or 'increase' to enlarge it. Accepted values are
-            ['decrease', 'increase'].
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        strategy = strategy.lower()
-        if strategy not in self.STRATEGY:
-            raise ValueError(
-                f'strategy [{strategy}] is not supported. '
-                f'Can only be one of {self.STRATEGY}. ')
-
-        self.min_ratio = Fraction(str(min_ratio).replace(':', '/'))
-        self.max_ratio = Fraction(str(max_ratio).replace(':', '/'))
-        self.strategy = strategy
- -
[docs] def process_single(self, sample): - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[Fields.source_file] = [] - return sample - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - loaded_video_keys = sample[self.video_key] - for index, video_key in enumerate(loaded_video_keys): - - container = load_video(video_key) - video = container.streams.video[0] - original_width = video.codec_context.width - original_height = video.codec_context.height - original_aspect_ratio = Fraction(original_width, original_height) - close_video(container) - - if (original_aspect_ratio >= self.min_ratio - and original_aspect_ratio <= self.max_ratio): - continue - - scaled_width, scaled_height = rescale( - original_width, - original_height, - original_aspect_ratio, - self.min_ratio, - self.max_ratio, - self.strategy, - ) - resized_video_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - if (not os.path.exists(resized_video_key) - or resized_video_key not in loaded_video_keys): - args = ['-nostdin', '-v', 'quiet', '-y'] - stream = ffmpeg.input(video_key) - stream = stream.filter('scale', - width=scaled_width, - height=scaled_height) - stream = stream.output(resized_video_key).global_args(*args) - stream.run() - loaded_video_keys[index] = resized_video_key - - # when the file is modified, its source file needs to be updated. - for i, value in enumerate(sample[self.video_key]): - if sample[Fields.source_file][i] != value: - if loaded_video_keys[i] != value: - sample[Fields.source_file][i] = value - - sample[self.video_key] = loaded_video_keys - return sample
-
- -
-
- -
-
-
-
- - - - \ No newline at end of file diff --git a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html b/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html deleted file mode 100644 index b7f85457f..000000000 --- a/_modules/data_juicer/ops/mapper/video_resize_resolution_mapper.html +++ /dev/null @@ -1,287 +0,0 @@ - - - - - - - - data_juicer.ops.mapper.video_resize_resolution_mapper — data_juicer 0.2.0 documentation - - - - - - - - - - - - - -
- - -
- -
-
-
- -
-
-
-
- -

Source code for data_juicer.ops.mapper.video_resize_resolution_mapper

-import math
-import os
-import sys
-
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import transfer_filename
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.logger_utils import HiddenPrints
-from data_juicer.utils.mm_utils import close_video, load_video
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-with HiddenPrints():
-    ffmpeg = LazyLoader('ffmpeg', 'ffmpeg')
-
-OP_NAME = 'video_resize_resolution_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class VideoResizeResolutionMapper(Mapper):
-    """
-    Mapper to resize video resolution. We leave deep-learning-based
-    super resolution for future work.
-    """
-
[docs]    def __init__(self,
-                 min_width: int = 1,
-                 max_width: int = sys.maxsize,
-                 min_height: int = 1,
-                 max_height: int = sys.maxsize,
-                 force_original_aspect_ratio: str = 'disable',
-                 force_divisible_by: PositiveInt = 2,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param min_width: Videos with width less than 'min_width' will be
-            mapped to videos with equal or bigger width.
-        :param max_width: Videos with width more than 'max_width' will be
-            mapped to videos with equal or smaller width.
-        :param min_height: Videos with height less than 'min_height' will be
-            mapped to videos with equal or bigger height.
-        :param max_height: Videos with height more than 'max_height' will be
-            mapped to videos with equal or smaller height.
-        :param force_original_aspect_ratio: Enable decreasing or
-            increasing output video width or height if necessary
-            to keep the original aspect ratio, including ['disable',
-            'decrease', 'increase'].
-        :param force_divisible_by: Ensures that both the output dimensions,
-            width and height, are divisible by the given integer when used
-            together with force_original_aspect_ratio; must be a positive
-            even number.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        force_original_aspect_ratio = force_original_aspect_ratio.lower()
-
-        if force_original_aspect_ratio not in [
-                'disable', 'decrease', 'increase'
-        ]:
-            raise ValueError(
-                f'force_original_aspect_ratio [{force_original_aspect_ratio}]'
-                f' is not supported. '
-                f"Can only be one of ['disable', 'decrease', 'increase']. ")
-        if (force_divisible_by <= 1 or force_divisible_by % 2
-                == 1) and force_original_aspect_ratio != 'disable':
-            raise ValueError(
-                f'force_divisible_by [{force_divisible_by}] must be a positive'
-                f' even number. ')
-
-        self.min_width = min_width
-        self.max_width = max_width
-        self.min_height = min_height
-        self.max_height = max_height
-        self.scale_method = 'scale'
-        self.force_original_aspect_ratio = force_original_aspect_ratio
-        self.force_divisible_by = force_divisible_by
- -
[docs] def process_single(self, sample, context=False):
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.source_file] = []
-            return sample
-
-        if Fields.source_file not in sample or not sample[Fields.source_file]:
-            sample[Fields.source_file] = sample[self.video_key]
-
-        loaded_video_keys = sample[self.video_key]
-
-        for index, video_key in enumerate(loaded_video_keys):
-
-            container = load_video(video_key)
-            video = container.streams.video[0]
-            width = video.codec_context.width
-            height = video.codec_context.height
-            origin_ratio = width / height
-            close_video(container)
-
-            if width >= self.min_width and width <= self.max_width and \
-                    height >= self.min_height and height <= self.max_height:
-                continue
-
-            # keep the original aspect ratio as much as possible
-            if width < self.min_width:
-                height = self.min_width / origin_ratio
-                width = self.min_width
-            if width > self.max_width:
-                height = self.max_width / origin_ratio
-                width = self.max_width
-            if height < self.min_height:
-                width = self.min_height * origin_ratio
-                height = self.min_height
-            if height > self.max_height:
-                width = self.max_height * origin_ratio
-                height = self.max_height
-
-            # the width and height of a video must be divisible by 2.
-            if self.force_original_aspect_ratio == 'disable':
-                force_divisible_by = 2
-            else:
-                force_divisible_by = self.force_divisible_by
-
-            # make sure the size is within the range if possible
-            width = int(max(width, self.min_width))
-            width = math.ceil(width / force_divisible_by) * force_divisible_by
-            width = int(min(width, self.max_width))
-            width = int(width / force_divisible_by) * force_divisible_by
-            height = int(max(height, self.min_height))
-            height = math.ceil(
-                height / force_divisible_by) * force_divisible_by
-            height = int(min(height, self.max_height))
-            height = int(height / force_divisible_by) * force_divisible_by
-
-            # keep the original aspect ratio
-            if self.force_original_aspect_ratio == 'increase':
-                if width / height < origin_ratio:
-                    width = height * origin_ratio
-                elif width / height > origin_ratio:
-                    height = width / origin_ratio
-            elif self.force_original_aspect_ratio == 'decrease':
-                if width / height < origin_ratio:
-                    height = width / origin_ratio
-                elif width / height > origin_ratio:
-                    width = height * origin_ratio
-            width = int(round(width / force_divisible_by)) * force_divisible_by
-            height = int(round(
-                height / force_divisible_by)) * force_divisible_by
-
-            # resize
-            resized_video_key = transfer_filename(video_key, OP_NAME,
-                                                  **self._init_parameters)
-            if (not os.path.exists(resized_video_key)
-                    or resized_video_key not in loaded_video_keys):
-                args = ['-nostdin', '-v', 'quiet',
-                        '-y']  # suppress the ffmpeg log
-                stream = ffmpeg.input(video_key)
-                stream = stream.filter('scale', width=width, height=height)
-                stream = stream.output(resized_video_key).global_args(*args)
-                stream.run()
-
-            loaded_video_keys[index] = resized_video_key
-
-        # when the file is modified, its source file needs to be updated.
-        for i, value in enumerate(sample[self.video_key]):
-            if sample[Fields.source_file][i] != value:
-                if loaded_video_keys[i] != value:
-                    sample[Fields.source_file][i] = value
-
-        sample[self.video_key] = loaded_video_keys
-        return sample
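As a minimal usage sketch (an illustration, not original page content): the snippet below instantiates the operator with the parameters documented above and runs it on one sample. It assumes data_juicer is installed, the default 'videos' and 'text' sample keys, and 'video.mp4' as a placeholder path.

```python
# Sketch: clamp resolutions into 480x360 .. 1920x1080, shrinking a
# dimension when needed while preserving the original aspect ratio.
from data_juicer.ops.mapper.video_resize_resolution_mapper import \
    VideoResizeResolutionMapper

op = VideoResizeResolutionMapper(min_width=480,
                                 max_width=1920,
                                 min_height=360,
                                 max_height=1080,
                                 force_original_aspect_ratio='decrease',
                                 force_divisible_by=2)
sample = {'videos': ['video.mp4'], 'text': ''}
result = op.process_single(sample)
print(result['videos'])  # paths of the (possibly resized) videos
```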
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html
deleted file mode 100644
index 67ad650c8..000000000
--- a/_modules/data_juicer/ops/mapper/video_split_by_duration_mapper.html
+++ /dev/null
@@ -1,275 +0,0 @@
Source code for data_juicer.ops.mapper.video_split_by_duration_mapper

-import copy
-import re
-
-import numpy as np
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import (add_suffix_to_filename,
-                                          transfer_filename)
-from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
-                                        cut_video_by_seconds,
-                                        get_video_duration, load_video)
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-
-def create_replacer(replacements):
-
-    def replacer(match):
-        return replacements.pop(0)
-
-    return replacer
-
-
-OP_NAME = 'video_split_by_duration_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class VideoSplitByDurationMapper(Mapper):
-    """Mapper to split videos by duration.
-    """
-
-    _batched_op = True
-
[docs] def __init__(self,
-                 split_duration: float = 10,
-                 min_last_split_duration: float = 0,
-                 keep_original_sample: bool = True,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param split_duration: duration of each video split in seconds.
-        :param min_last_split_duration: The minimum allowable duration in
-            seconds for the last video split. If the duration of the last
-            split is less than this value, it will be discarded.
-        :param keep_original_sample: whether to keep the original sample. If
-            it's set to False, only the split samples will remain in the
-            final dataset and the original sample will be removed. It's True
-            by default.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        self.split_duration = split_duration
-        self.min_last_split_duration = min_last_split_duration
-        self.keep_original_sample = keep_original_sample
-        self.extra_args = kwargs
- -
[docs] def split_videos_by_duration(self, video_key, container): - video_duration = get_video_duration(container) - timestamps = np.arange(0, video_duration, self.split_duration).tolist() - count = 0 - split_video_keys = [] - unique_video_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - for i in range(1, len(timestamps)): - split_video_key = add_suffix_to_filename(unique_video_key, - f'_{count}') - if cut_video_by_seconds(container, split_video_key, - timestamps[i - 1], timestamps[i]): - split_video_keys.append(split_video_key) - count += 1 - - if video_duration - timestamps[-1] >= self.min_last_split_duration: - split_video_key = add_suffix_to_filename(unique_video_key, - f'_{count}') - - if cut_video_by_seconds(container, split_video_key, - timestamps[-1]): - split_video_keys.append(split_video_key) - return split_video_keys
- - def _process_single_sample(self, sample): - # there is no video in this sample - if self.video_key not in sample or sample[ - self.video_key] is None or len(sample[self.video_key]) == 0: - sample[Fields.source_file] = [] - return [] - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - # the split results - split_sample = copy.deepcopy(sample) - split_sample[self.text_key] = '' - split_sample[Fields.source_file] = [] - - # load all video(s) - loaded_video_keys = sample[self.video_key] - videos = {} - for loaded_video_key in loaded_video_keys: - if loaded_video_key not in videos: - # avoid loading the same videos - video = load_video(loaded_video_key) - videos[loaded_video_key] = video - - split_video_keys = [] - offset = 0 - # split each video chunk by chunk - for chunk in sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks or contents after the last eoc token - if not chunk.strip(): - continue - else: - video_count = chunk.count(SpecialTokens.video) - place_holders = [] - for video_key in loaded_video_keys[offset:offset + - video_count]: - video = videos[video_key] - new_video_keys = self.split_videos_by_duration( - video_key, video) - close_video(video) - split_video_keys.extend(new_video_keys) - place_holders.append(SpecialTokens.video * - len(new_video_keys)) - split_sample[Fields.source_file].extend( - [video_key] * len(new_video_keys)) - - # insert the generated text according to given mode - replacer_function = create_replacer(place_holders) - new_split_text_per_chunk = re.sub(SpecialTokens.video, - replacer_function, chunk) - split_sample[ - self. - text_key] += f'{new_split_text_per_chunk}{SpecialTokens.eoc}' # noqa: E501 - offset += video_count - - split_sample[self.video_key] = split_video_keys - return [split_sample] - -
[docs] def process_batched(self, samples): - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - samples_after_split = [] - # do split for each sample within the batch - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_split.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample) - if len(generated_samples) != 0: - samples_after_split.extend(generated_samples) - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_split[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_split] - return res_samples
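A minimal usage sketch (an addition for illustration): since this is a batched op, the input is a dict of lists, and the text field carries one video token per video. It assumes data_juicer is installed, the default 'videos' and 'text' keys, and a placeholder 'video.mp4'.

```python
# Sketch: split each video into 10-second clips; trailing clips shorter
# than 3 seconds are dropped, and the original samples are not kept.
from data_juicer.ops.mapper.video_split_by_duration_mapper import \
    VideoSplitByDurationMapper
from data_juicer.utils.mm_utils import SpecialTokens

op = VideoSplitByDurationMapper(split_duration=10,
                                min_last_split_duration=3,
                                keep_original_sample=False)
samples = {
    'text': [f'{SpecialTokens.video} a demo clip {SpecialTokens.eoc}'],
    'videos': [['video.mp4']],
}
result = op.process_batched(samples)
print(result['videos'])  # each entry lists the generated split clip paths
```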
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html
deleted file mode 100644
index 4b48c4d83..000000000
--- a/_modules/data_juicer/ops/mapper/video_split_by_key_frame_mapper.html
+++ /dev/null
@@ -1,258 +0,0 @@
Source code for data_juicer.ops.mapper.video_split_by_key_frame_mapper

-import copy
-import re
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import (add_suffix_to_filename,
-                                          transfer_filename)
-from data_juicer.utils.mm_utils import (SpecialTokens, close_video,
-                                        cut_video_by_seconds,
-                                        get_key_frame_seconds, load_video)
-
-from ..base_op import OPERATORS, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-
-def create_replacer(replacements):
-
-    def replacer(match):
-        return replacements.pop(0)
-
-    return replacer
-
-
-OP_NAME = 'video_split_by_key_frame_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class VideoSplitByKeyFrameMapper(Mapper):
-    """Mapper to split videos by key frames.
-    """
-
-    _batched_op = True
-
[docs] def __init__(self, keep_original_sample: bool = True, *args, **kwargs):
-        """
-        Initialization method.
-
-        :param keep_original_sample: whether to keep the original sample. If
-            it's set to False, only the split samples will remain in the
-            final dataset and the original sample will be removed. It's True
-            by default.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        self.keep_original_sample = keep_original_sample
-        self.extra_args = kwargs
- -
[docs] def get_split_key_frame(self, video_key, container): - timestamps = get_key_frame_seconds(container) - - count = 0 - split_video_keys = [] - unique_video_key = transfer_filename(video_key, OP_NAME, - **self._init_parameters) - for i in range(1, len(timestamps)): - split_video_key = add_suffix_to_filename(unique_video_key, - f'_{count}') - if cut_video_by_seconds(container, split_video_key, - timestamps[i - 1], timestamps[i]): - split_video_keys.append(split_video_key) - count += 1 - - split_video_key = add_suffix_to_filename(unique_video_key, f'_{count}') - if cut_video_by_seconds(container, split_video_key, timestamps[-1]): - split_video_keys.append(split_video_key) - return split_video_keys
- - def _process_single_sample(self, sample): - # there is no video in this sample - if self.video_key not in sample or sample[ - self.video_key] is None or len(sample[self.video_key]) == 0: - sample[Fields.source_file] = [] - return [] - - if Fields.source_file not in sample or not sample[Fields.source_file]: - sample[Fields.source_file] = sample[self.video_key] - - # the split results - split_sample = copy.deepcopy(sample) - split_sample[self.text_key] = '' - split_sample[Fields.source_file] = [] - - # load all video(s) - loaded_video_keys = sample[self.video_key] - videos = {} - for loaded_video_key in loaded_video_keys: - if loaded_video_key not in videos: - # avoid loading the same videos - video = load_video(loaded_video_key) - videos[loaded_video_key] = video - - split_video_keys = [] - offset = 0 - # split each video chunk by chunk - for chunk in sample[self.text_key].split(SpecialTokens.eoc): - # skip empty chunks or contents after the last eoc token - if not chunk.strip(): - continue - else: - video_count = chunk.count(SpecialTokens.video) - place_holders = [] - for video_key in loaded_video_keys[offset:offset + - video_count]: - video = videos[video_key] - new_video_keys = self.get_split_key_frame(video_key, video) - close_video(video) - split_video_keys.extend(new_video_keys) - place_holders.append(SpecialTokens.video * - len(new_video_keys)) - split_sample[Fields.source_file].extend( - [video_key] * len(new_video_keys)) - - # insert the generated text according to given mode - replacer_function = create_replacer(place_holders) - new_split_text_per_chunk = re.sub(SpecialTokens.video, - replacer_function, chunk) - split_sample[ - self. - text_key] += f'{new_split_text_per_chunk}{SpecialTokens.eoc}' # noqa: E501 - offset += video_count - - split_sample[self.video_key] = split_video_keys - return [split_sample] - -
[docs] def process_batched(self, samples): - # reconstruct samples from "dict of lists" to "list of dicts" - reconstructed_samples = [] - for i in range(len(samples[self.text_key])): - reconstructed_samples.append( - {key: samples[key][i] - for key in samples}) - samples_after_split = [] - # do split for each sample within the batch - for ori_sample in reconstructed_samples: - if self.keep_original_sample: - samples_after_split.append(ori_sample) - generated_samples = self._process_single_sample(ori_sample) - if len(generated_samples) != 0: - samples_after_split.extend(generated_samples) - # reconstruct samples from "list of dicts" to "dict of lists" - keys = samples_after_split[0].keys() - res_samples = {} - for key in keys: - res_samples[key] = [s[key] for s in samples_after_split] - - return res_samples
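For orientation, a minimal sketch of the batched interface (not original page content), assuming data_juicer is installed, the default 'videos' and 'text' keys, and a placeholder 'video.mp4':

```python
# Sketch: split videos at their key frames, keeping only the split samples.
from data_juicer.ops.mapper.video_split_by_key_frame_mapper import \
    VideoSplitByKeyFrameMapper
from data_juicer.utils.mm_utils import SpecialTokens

op = VideoSplitByKeyFrameMapper(keep_original_sample=False)
samples = {
    'text': [f'{SpecialTokens.video} a demo clip {SpecialTokens.eoc}'],
    'videos': [['video.mp4']],
}
result = op.process_batched(samples)
print(result['videos'])  # one list of key-frame clips per input video list
```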
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html b/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html
deleted file mode 100644
index 1929ffe42..000000000
--- a/_modules/data_juicer/ops/mapper/video_split_by_scene_mapper.html
+++ /dev/null
@@ -1,259 +0,0 @@
Source code for data_juicer.ops.mapper.video_split_by_scene_mapper

-import math
-import re
-from itertools import chain
-
-from pydantic import NonNegativeFloat, NonNegativeInt
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.file_utils import (add_suffix_to_filename,
-                                          transfer_filename)
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import SpecialTokens
-
-from ..base_op import OPERATORS, Mapper
-
-scenedetect = LazyLoader('scenedetect', 'scenedetect')
-
-OP_NAME = 'video_split_by_scene_mapper'
-
-
-def replace_func(match, scene_counts_iter):
-    try:
-        count = next(scene_counts_iter)
-        return SpecialTokens.video * count
-    except StopIteration:
-        return match.group(0)
-
-
-
[docs]@OPERATORS.register_module(OP_NAME) -class VideoSplitBySceneMapper(Mapper): - """Mapper to cut videos into scene clips. - """ - - # Define shared detector keys and their properties - avaliable_detectors = { - 'ContentDetector': ['weights', 'luma_only', 'kernel_size'], - 'AdaptiveDetector': [ - 'window_width', 'min_content_val', 'weights', 'luma_only', - 'kernel_size', 'video_manager', 'min_delta_hsv' - ], - 'ThresholdDetector': - ['fade_bias', 'add_final_scene', 'method', 'block_size'] - } - -
[docs] def __init__(self,
-                 detector: str = 'ContentDetector',
-                 threshold: NonNegativeFloat = 27.0,
-                 min_scene_len: NonNegativeInt = 15,
-                 show_progress: bool = False,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param detector: Algorithm from `scenedetect.detectors`. Should be one
-            of ['ContentDetector', 'ThresholdDetector', 'AdaptiveDetector'].
-        :param threshold: Threshold passed to the detector.
-        :param min_scene_len: Minimum length of any scene.
-        :param show_progress: Whether to show progress from scenedetect.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self._init_parameters = self.remove_extra_parameters(locals())
-
-        if detector not in self.avaliable_detectors:
-            raise ValueError(
-                f'Scene detector {detector} is not supported. '
-                f'Can only be one of {list(self.avaliable_detectors.keys())}')
-
-        self.detector = detector
-        self.threshold = threshold
-        self.min_scene_len = min_scene_len
-        self.show_progress = show_progress
-
-        # prepare detector args
-        avaliable_kwargs = self.avaliable_detectors[self.detector]
-        self.detector_class = getattr(scenedetect.detectors, self.detector)
-        self.detector_kwargs = {
-            key: kwargs[key]
-            for key in avaliable_kwargs if key in kwargs
-        }
- -
[docs] def process_single(self, sample, context=False):
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[Fields.source_file] = []
-            return sample
-
-        # load videos
-        loaded_video_keys = sample[self.video_key]
-        output_video_keys = {}
-        scene_counts = {}
-
-        for video_key in loaded_video_keys:
-
-            # skip duplicate
-            if video_key in output_video_keys:
-                continue
-
-            redirected_video_key = transfer_filename(video_key, OP_NAME,
-                                                     **self._init_parameters)
-            output_template = add_suffix_to_filename(redirected_video_key,
-                                                     '_$SCENE_NUMBER')
-
-            # detect scenes
-            detector = self.detector_class(self.threshold, self.min_scene_len,
-                                           **self.detector_kwargs)
-            scene_list = scenedetect.detect(video_key,
-                                            detector,
-                                            show_progress=self.show_progress,
-                                            start_in_scene=True)
-            scene_counts[video_key] = len(scene_list)
-
-            if len(scene_list) > 1:
-                # sync with split_video_ffmpeg internal
-                scene_num_format = f'%0{max(3, math.floor(math.log(len(scene_list), 10)) + 1)}d'  # noqa: E501
-                output_video_keys[video_key] = [
-                    output_template.replace('$SCENE_NUMBER',
-                                            scene_num_format % (i + 1))
-                    for i in range(len(scene_list))
-                ]
-                # split video into clips
-                scenedetect.split_video_ffmpeg(
-                    input_video_path=video_key,
-                    scene_list=scene_list,
-                    output_file_template=output_template,
-                    show_progress=self.show_progress)
-            else:
-                output_video_keys[video_key] = [video_key]
-
-        # replace split video tokens
-        if self.text_key in sample:
-            scene_counts_iter = iter(
-                [scene_counts[key] for key in loaded_video_keys])
-            updated_text = re.sub(
-                re.escape(SpecialTokens.video),
-                lambda match: replace_func(match, scene_counts_iter),
-                sample[self.text_key])
-            sample[self.text_key] = updated_text
-
-        # when the file is modified, its source file needs to be updated.
-        sample[Fields.source_file] = []
-        for value in loaded_video_keys:
-            sample[Fields.source_file].extend([value] *
-                                              len(output_video_keys[value]))
-
-        sample[self.video_key] = list(
-            chain.from_iterable(
-                [output_video_keys[key] for key in loaded_video_keys]))
-        return sample
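A minimal usage sketch (added for illustration, assuming data_juicer and scenedetect are installed, the default 'videos'/'text' keys, and 'video.mp4' as a placeholder):

```python
# Sketch: detect scene cuts with ContentDetector and split accordingly.
from data_juicer.ops.mapper.video_split_by_scene_mapper import \
    VideoSplitBySceneMapper
from data_juicer.utils.mm_utils import SpecialTokens

op = VideoSplitBySceneMapper(detector='ContentDetector',
                             threshold=27.0,
                             min_scene_len=15)
sample = {'videos': ['video.mp4'],
          'text': f'{SpecialTokens.video} a demo clip'}
result = op.process_single(sample)
print(result['videos'])  # one clip path per detected scene
```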
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html
deleted file mode 100644
index 29923c38e..000000000
--- a/_modules/data_juicer/ops/mapper/video_tagging_from_audio_mapper.html
+++ /dev/null
@@ -1,202 +0,0 @@
Source code for data_juicer.ops.mapper.video_tagging_from_audio_mapper

-import librosa
-import numpy as np
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.lazy_loader import AUTOINSTALL, LazyLoader
-from data_juicer.utils.mm_utils import extract_audio_from_video
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, Mapper
-
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'video_tagging_from_audio_mapper'
-
-
-
[docs]@OPERATORS.register_module(OP_NAME)
-class VideoTaggingFromAudioMapper(Mapper):
-    """Mapper to generate video tags from audio streams extracted from
-    videos, using the Audio Spectrogram Transformer.
-    """
-
-    _accelerator = 'cuda'
-
[docs] def __init__(self,
-                 hf_ast: str = 'MIT/ast-finetuned-audioset-10-10-0.4593',
-                 trust_remote_code: bool = False,
-                 tag_field_name: str = Fields.video_audio_tags,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param hf_ast: path to the HF model used to tag audio streams.
-        :param trust_remote_code: whether to trust the remote code of HF models
-        :param tag_field_name: the field name to store the tags. It's
-            "__dj__video_audio_tags__" by default.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        AUTOINSTALL.check(['torchaudio'])
-        self.model_key = prepare_model(model_type='huggingface',
-                                       pretrained_model_name_or_path=hf_ast,
-                                       trust_remote_code=trust_remote_code)
-        self._model_sampling_rate = 16000
-        self._no_audio_label = 'EMPTY'
-
-        self.tag_field_name = tag_field_name
- -
[docs] def process_single(self, sample, rank=None):
-        # check if it's generated already
-        if self.tag_field_name in sample:
-            return sample
-
-        # there is no video in this sample
-        if self.video_key not in sample or not sample[self.video_key]:
-            sample[self.tag_field_name] = np.array([], dtype=np.str_)
-            return sample
-
-        # load video paths
-        loaded_video_keys = sample[self.video_key]
-
-        model, feature_extractor = get_model(self.model_key, rank,
-                                             self.use_cuda())
-        video_audio_tags = []
-        for video_path in loaded_video_keys:
-            # only extract audio data and sr for index 0 for now
-            ys, srs, valid_indexes = extract_audio_from_video(
-                video_path, stream_indexes=[0])
-            if len(valid_indexes) == 0:
-                # there are no valid audio streams. Skip!
-                video_audio_tags.append(self._no_audio_label)
-                continue
-
-            # inference
-            y = ys[0]
-            sr = srs[0]
-            # check if it meets the sampling rate condition of the model
-            if sr != self._model_sampling_rate:
-                y = librosa.resample(y,
-                                     orig_sr=sr,
-                                     target_sr=self._model_sampling_rate)
-                sr = self._model_sampling_rate
-            inputs = feature_extractor(y,
-                                       sampling_rate=sr,
-                                       return_tensors='pt').to(model.device)
-            with torch.no_grad():
-                logits = model(**inputs).logits
-            predicted_tag_id = torch.argmax(logits, dim=-1).item()
-            predicted_tag = model.config.id2label[predicted_tag_id]
-            video_audio_tags.append(predicted_tag)
-        sample[self.tag_field_name] = np.array(video_audio_tags, dtype=np.str_)
-        return sample
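A usage sketch (an addition, not original content), assuming data_juicer and its audio dependencies are installed, network access to download the HF checkpoint, the default 'videos'/'text' keys, and a placeholder 'video.mp4':

```python
# Sketch: tag each video by classifying its first audio stream with AST.
from data_juicer.ops.mapper.video_tagging_from_audio_mapper import \
    VideoTaggingFromAudioMapper

op = VideoTaggingFromAudioMapper(
    hf_ast='MIT/ast-finetuned-audioset-10-10-0.4593')
sample = {'videos': ['video.mp4'], 'text': ''}
result = op.process_single(sample)
print(result[op.tag_field_name])  # one tag per video, 'EMPTY' if no audio
```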
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html b/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html
deleted file mode 100644
index 651df2556..000000000
--- a/_modules/data_juicer/ops/mapper/video_tagging_from_frames_mapper.html
+++ /dev/null
@@ -1,227 +0,0 @@
Source code for data_juicer.ops.mapper.video_tagging_from_frames_mapper

-from collections import Counter
-
-import numpy as np
-from pydantic import PositiveInt
-
-from data_juicer.utils.constant import Fields
-from data_juicer.utils.lazy_loader import LazyLoader
-from data_juicer.utils.mm_utils import (close_video, extract_key_frames,
-                                        extract_video_frames_uniformly,
-                                        load_data_with_context, load_video)
-from data_juicer.utils.model_utils import get_model, prepare_model
-
-from ..base_op import OPERATORS, UNFORKABLE, Mapper
-from ..op_fusion import LOADED_VIDEOS
-
-ram = LazyLoader('ram', 'ram')
-torch = LazyLoader('torch', 'torch')
-
-OP_NAME = 'video_tagging_from_frames_mapper'
-
-
-
[docs]@UNFORKABLE.register_module(OP_NAME)
-@OPERATORS.register_module(OP_NAME)
-@LOADED_VIDEOS.register_module(OP_NAME)
-class VideoTaggingFromFramesMapper(Mapper):
-    """Mapper to generate video tags from frames extracted from videos.
-    """
-
-    _accelerator = 'cuda'
-
[docs] def __init__(self,
-                 frame_sampling_method: str = 'all_keyframes',
-                 frame_num: PositiveInt = 3,
-                 tag_field_name: str = Fields.video_frame_tags,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param frame_sampling_method: sampling method of extracting frame
-            images from the videos. Should be one of
-            ["all_keyframes", "uniform"].
-            The former one extracts all key frames (the number of which depends
-            on the duration of the video) and the latter one extracts a
-            specified number of frames uniformly from the video.
-            Default: "all_keyframes".
-        :param frame_num: the number of frames to be extracted uniformly from
-            the video. Only works when frame_sampling_method is "uniform". If
-            it's 1, only the middle frame will be extracted. If it's 2, only
-            the first and the last frames will be extracted. If it's larger
-            than 2, in addition to the first and the last frames, other frames
-            will be extracted uniformly within the video duration.
-        :param tag_field_name: the field name to store the tags. It's
-            "__dj__video_frame_tags__" by default.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        if frame_sampling_method not in ['all_keyframes', 'uniform']:
-            raise ValueError(
-                f'Frame sampling method [{frame_sampling_method}] is not '
-                f'supported. Can only be one of ["all_keyframes", "uniform"].')
-        self.model_key = prepare_model(
-            model_type='recognizeAnything',
-            pretrained_model_name_or_path='ram_plus_swin_large_14m.pth',
-            input_size=384)
-        self.frame_sampling_method = frame_sampling_method
-        self.frame_num = frame_num
-        self.transform = ram.get_transform(image_size=384)
-
-        self.tag_field_name = tag_field_name
- -
[docs] def process_single(self, sample, rank=None, context=False): - # check if it's generated already - if self.tag_field_name in sample: - return sample - - # there is no video in this sample - if self.video_key not in sample or not sample[self.video_key]: - sample[self.tag_field_name] = np.array([[]], dtype=np.str_) - return sample - - # load videos - loaded_video_keys = sample[self.video_key] - sample, videos = load_data_with_context(sample, context, - loaded_video_keys, load_video) - - model = get_model(self.model_key, rank, self.use_cuda()) - video_tags = [] - for _, value in enumerate(loaded_video_keys): - video = videos[value] - - # extract frame images - if self.frame_sampling_method == 'all_keyframes': - frames = extract_key_frames(video) - elif self.frame_sampling_method == 'uniform': - frames = extract_video_frames_uniformly(video, self.frame_num) - else: - video_tags.append([]) - continue - - frame_tensor = torch.stack([ - self.transform(frame.to_image()) for frame in frames - ]).to(next(model.parameters()).device) - with torch.no_grad(): - tags, _ = model.generate_tag(frame_tensor) - - words = [word.strip() for tag in tags for word in tag.split('|')] - word_count = Counter(words) - sorted_word_list = [item for item, _ in word_count.most_common()] - video_tags.append(np.array(sorted_word_list, dtype=np.str_)) - - if not context: - for vid_key in videos: - close_video(videos[vid_key]) - - sample[self.tag_field_name] = video_tags - return sample
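A usage sketch (added for illustration), assuming data_juicer and the RAM dependency are installed, the RAM++ checkpoint is available, the default 'videos'/'text' keys, and a placeholder 'video.mp4':

```python
# Sketch: tag videos from 3 uniformly sampled frames per video.
from data_juicer.ops.mapper.video_tagging_from_frames_mapper import \
    VideoTaggingFromFramesMapper

op = VideoTaggingFromFramesMapper(frame_sampling_method='uniform',
                                  frame_num=3)
sample = {'videos': ['video.mp4'], 'text': ''}
result = op.process_single(sample)
print(result[op.tag_field_name])  # per-video tag arrays, most frequent first
```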
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html b/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
deleted file mode 100644
index 18de64751..000000000
--- a/_modules/data_juicer/ops/mapper/whitespace_normalization_mapper.html
+++ /dev/null
@@ -1,150 +0,0 @@
Source code for data_juicer.ops.mapper.whitespace_normalization_mapper

-# Most of the code here has been modified from:
-# https://github.com/bigscience-workshop/data-preparation
-# --------------------------------------------------------
-
-from ..base_op import OPERATORS, Mapper
-from ..common.special_characters import VARIOUS_WHITESPACES
-
-
-
[docs]@OPERATORS.register_module('whitespace_normalization_mapper') -class WhitespaceNormalizationMapper(Mapper): - """ - Mapper to normalize different kinds of whitespaces to whitespace ' ' (0x20) - in text samples. - - Different kinds of whitespaces can be found here: - https://en.wikipedia.org/wiki/Whitespace_character - """ - - _batched_op = True - -
[docs] def __init__(self, *args, **kwargs): - """ - Initialization method. - - :param args: extra args - :param kwargs: extra args - """ - super().__init__(*args, **kwargs)
- -
[docs] def process_batched(self, samples): - for idx, text in enumerate(samples[self.text_key]): - # remove whitespaces before and after the main content - text = text.strip() - - # replace all kinds of whitespaces with ' ' - samples[self.text_key][idx] = ''.join([ - char if char not in VARIOUS_WHITESPACES else ' ' - for char in text - ]) - - return samples
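A minimal sketch of the batched interface (added for illustration, assuming data_juicer is installed and 'text' is the default text key):

```python
# Sketch: exotic Unicode spaces are mapped to ' ' (0x20) and both ends
# are stripped.
from data_juicer.ops.mapper.whitespace_normalization_mapper import \
    WhitespaceNormalizationMapper

op = WhitespaceNormalizationMapper()
samples = {'text': ['\u00a0hello\u2009world\u3000 ']}
result = op.process_batched(samples)
print(result['text'])  # expected: ['hello world'], provided these
                       # characters appear in VARIOUS_WHITESPACES
```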
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html b/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
deleted file mode 100644
index 71bd7e475..000000000
--- a/_modules/data_juicer/ops/selector/frequency_specified_field_selector.html
+++ /dev/null
@@ -1,199 +0,0 @@
Source code for data_juicer.ops.selector.frequency_specified_field_selector

-import numbers
-from typing import Optional
-
-from pydantic import Field, PositiveInt
-from typing_extensions import Annotated
-
-from ..base_op import OPERATORS, Selector
-
-
-
[docs]@OPERATORS.register_module('frequency_specified_field_selector')
-class FrequencySpecifiedFieldSelector(Selector):
-    """Selector to select samples based on the sorted frequency of a
-    specified field."""
-
[docs] def __init__(self,
-                 field_key: str = '',
-                 top_ratio: Optional[Annotated[float,
-                                               Field(ge=0, le=1)]] = None,
-                 topk: Optional[PositiveInt] = None,
-                 reverse: bool = True,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param field_key: Selector based on the specified value
-            corresponding to the target key. The target key
-            corresponding to multi-level field information needs to be
-            separated by '.'.
-        :param top_ratio: Ratio of selected top specified field values,
-            samples will be selected if their specified field values are
-            within this parameter. When both topk and top_ratio are set,
-            the value corresponding to the smaller number of samples
-            will be applied.
-        :param topk: Number of selected top specified field values,
-            samples will be selected if their specified field values are
-            within this parameter. When both topk and top_ratio are set,
-            the value corresponding to the smaller number of samples
-            will be applied.
-        :param reverse: Determine the sorting rule, if reverse=True,
-            then sort in descending order.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.field_key = field_key
-        self.top_ratio = top_ratio
-        self.topk = topk
-        self.reverse = reverse
- -
[docs] def process(self, dataset): - if len(dataset) <= 1 or not self.field_key: - return dataset - - field_keys = self.field_key.split('.') - assert field_keys[0] in dataset.features.keys( - ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) - - field_value_dict = {} - for i, item in enumerate(dataset[field_keys[0]]): - field_value = item - for key in field_keys[1:]: - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - assert field_value is None or isinstance( - field_value, str) or isinstance( - field_value, numbers.Number - ), 'The {} item is not String, Numbers or NoneType'.format(i) - if field_value not in field_value_dict.keys(): - field_value_dict[field_value] = [i] - else: - field_value_dict[field_value].append(i) - - select_num = 0 - if not self.top_ratio: - if not self.topk: - return dataset - else: - select_num = self.topk - else: - select_num = self.top_ratio * len(field_value_dict) - if self.topk and self.topk < select_num: - select_num = self.topk - - select_index = sum( - sorted(field_value_dict.values(), - key=lambda x: len(x), - reverse=self.reverse)[:int(select_num)], []) - return dataset.select(select_index)
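To illustrate the selection logic above, here is a small sketch on an in-memory Hugging Face dataset (an addition, not original content; it assumes data_juicer and datasets are installed):

```python
# Sketch: keep samples whose 'meta.lang' value belongs to the single most
# frequent value group (topk=1), here the three 'en' rows.
from datasets import Dataset

from data_juicer.ops.selector.frequency_specified_field_selector import \
    FrequencySpecifiedFieldSelector

ds = Dataset.from_dict({
    'text': ['a', 'b', 'c', 'd', 'e'],
    'meta': [{'lang': 'en'}, {'lang': 'en'}, {'lang': 'zh'},
             {'lang': 'en'}, {'lang': 'fr'}],
})
op = FrequencySpecifiedFieldSelector(field_key='meta.lang', topk=1)
print(op.process(ds)['text'])  # ['a', 'b', 'd']
```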
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/random_selector.html b/_modules/data_juicer/ops/selector/random_selector.html
deleted file mode 100644
index a7a470613..000000000
--- a/_modules/data_juicer/ops/selector/random_selector.html
+++ /dev/null
@@ -1,162 +0,0 @@
Source code for data_juicer.ops.selector.random_selector

-from typing import Optional
-
-from pydantic import Field, PositiveInt
-from typing_extensions import Annotated
-
-from data_juicer.format.mixture_formatter import MixtureFormatter
-
-from ..base_op import OPERATORS, Selector
-
-
-
[docs]@OPERATORS.register_module('random_selector')
-class RandomSelector(Selector):
-    """Selector to randomly select samples. """
-
[docs] def __init__(self,
-                 select_ratio: Optional[Annotated[float,
-                                                  Field(ge=0, le=1)]] = None,
-                 select_num: Optional[PositiveInt] = None,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param select_ratio: The ratio to select. When both
-            select_ratio and select_num are set, the value corresponding
-            to the smaller number of samples will be applied.
-        :param select_num: The number of samples to select. When both
-            select_ratio and select_num are set, the value corresponding
-            to the smaller number of samples will be applied.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.select_ratio = select_ratio
-        self.select_num = select_num
- -
[docs] def process(self, dataset): - if len(dataset) <= 1: - return dataset - - if self.select_ratio is None and self.select_num is None: - return dataset - - select_num = 0 - if not self.select_ratio: - select_num = self.select_num - else: - select_num = int(self.select_ratio * len(dataset)) - if self.select_num and self.select_num < select_num: - select_num = self.select_num - - return MixtureFormatter.random_sample(dataset, - sample_number=select_num)
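A short sketch of the tie-breaking rule described above (added for illustration; assumes data_juicer and datasets are installed):

```python
# Sketch: when both knobs are set, the smaller resulting size wins
# (here 3 < 0.5 * 10 = 5).
from datasets import Dataset

from data_juicer.ops.selector.random_selector import RandomSelector

ds = Dataset.from_dict({'text': [f'sample-{i}' for i in range(10)]})
op = RandomSelector(select_ratio=0.5, select_num=3)
print(len(op.process(ds)))  # 3 randomly sampled rows
```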
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/range_specified_field_selector.html b/_modules/data_juicer/ops/selector/range_specified_field_selector.html
deleted file mode 100644
index e87358b0d..000000000
--- a/_modules/data_juicer/ops/selector/range_specified_field_selector.html
+++ /dev/null
@@ -1,223 +0,0 @@
Source code for data_juicer.ops.selector.range_specified_field_selector

-import heapq
-from typing import Optional
-
-from pydantic import Field, PositiveInt
-from typing_extensions import Annotated
-
-from data_juicer.utils.common_utils import stats_to_number
-
-from ..base_op import OPERATORS, Selector
-
-
-
[docs]@OPERATORS.register_module('range_specified_field_selector') -class RangeSpecifiedFieldSelector(Selector): - """Selector to select a range of samples based on the sorted - specified field value from smallest to largest. """ - -
[docs] def __init__(
-            self,
-            field_key: str = '',
-            lower_percentile: Optional[Annotated[float,
-                                                 Field(ge=0, le=1)]] = None,
-            upper_percentile: Optional[Annotated[float,
-                                                 Field(ge=0, le=1)]] = None,
-            lower_rank: Optional[PositiveInt] = None,
-            upper_rank: Optional[PositiveInt] = None,
-            *args,
-            **kwargs):
-        """
-        Initialization method.
-
-        :param field_key: Selector based on the specified value
-            corresponding to the target key. The target key
-            corresponding to multi-level field information needs to be
-            separated by '.'.
-        :param lower_percentile: The lower bound of the percentile to
-            be sampled, samples will be selected if their specified field
-            values are greater than this lower bound. When both
-            lower_percentile and lower_rank are set, the value corresponding
-            to the larger number of samples will be applied.
-        :param upper_percentile: The upper bound of the percentile to
-            be sampled, samples will be selected if their specified field
-            values are less than or equal to the upper bound. When both
-            upper_percentile and upper_rank are set, the value corresponding
-            to the smaller number of samples will be applied.
-        :param lower_rank: The lower bound of the rank to be sampled,
-            samples will be selected if their specified field values are
-            greater than this lower bound. When both lower_percentile and
-            lower_rank are set, the value corresponding to the larger number
-            of samples will be applied.
-        :param upper_rank: The upper bound of the rank to be sampled,
-            samples will be selected if their specified field values are
-            less than or equal to the upper bound. When both upper_percentile
-            and upper_rank are set, the value corresponding to the smaller
-            number of samples will be applied.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.field_key = field_key
-        self.lower_percentile = lower_percentile
-        self.upper_percentile = upper_percentile
-        self.lower_rank = lower_rank
-        self.upper_rank = upper_rank
- -
[docs] def process(self, dataset): - if len(dataset) <= 1 or not self.field_key: - return dataset - - if self.lower_percentile is None and self.lower_rank is None: - return dataset - if self.upper_percentile is None and self.upper_rank is None: - return dataset - - lower_bound, upper_bound = 0, len(dataset) - if self.lower_percentile is not None: - lower_bound = int(self.lower_percentile * len(dataset)) - if self.lower_rank is not None: - lower_bound = max(lower_bound, self.lower_rank) - if self.upper_percentile is not None: - upper_bound = int(self.upper_percentile * len(dataset)) - if self.upper_rank is not None: - upper_bound = min(upper_bound, self.upper_rank) - upper_bound = max(lower_bound, upper_bound) - - field_keys = self.field_key.split('.') - assert field_keys[0] in dataset.features.keys( - ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) - - def get_field_value_list(cur_dataset, field_keys): - if len(field_keys) == 1: - field_value_list = cur_dataset[field_keys[0]] - else: - field_value_list = [] - for item in cur_dataset[field_keys[0]]: - field_value = item - for key in field_keys[1:]: - assert key in field_value.keys( - ), "'{}' not in {}".format(key, field_value.keys()) - field_value = field_value[key] - field_value_list.append(field_value) - field_value_list = [stats_to_number(s) for s in field_value_list] - return field_value_list - - field_value_list = get_field_value_list(dataset, field_keys) - select_index = heapq.nsmallest(int(upper_bound), range(len(dataset)), - field_value_list.__getitem__) - sub_dataset = dataset.select(select_index) - - field_value_list = get_field_value_list(sub_dataset, field_keys) - select_index = heapq.nlargest(int(upper_bound - lower_bound), - range(len(sub_dataset)), - field_value_list.__getitem__) - - return sub_dataset.select(select_index)
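To make the two-pass heap selection above concrete, a small sketch (added for illustration; assumes data_juicer and datasets are installed, and uses the stats field name '__dj__stats__' that mirrors Fields.stats elsewhere in the codebase):

```python
# Sketch: keep the (0.2, 0.8] percentile band of samples ranked by a
# per-sample perplexity stat.
from datasets import Dataset

from data_juicer.ops.selector.range_specified_field_selector import \
    RangeSpecifiedFieldSelector

ds = Dataset.from_dict({
    'text': list('abcdefghij'),
    '__dj__stats__': [{'perplexity': float(i)} for i in range(10)],
})
op = RangeSpecifiedFieldSelector(field_key='__dj__stats__.perplexity',
                                 lower_percentile=0.2,
                                 upper_percentile=0.8)
print(sorted(op.process(ds)['text']))  # ['c', 'd', 'e', 'f', 'g', 'h']
```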
\ No newline at end of file
diff --git a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html b/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
deleted file mode 100644
index ee857f9c5..000000000
--- a/_modules/data_juicer/ops/selector/topk_specified_field_selector.html
+++ /dev/null
@@ -1,201 +0,0 @@
Source code for data_juicer.ops.selector.topk_specified_field_selector

-import heapq
-from typing import Optional
-
-from pydantic import Field, PositiveInt
-from typing_extensions import Annotated
-
-from data_juicer.utils.common_utils import stats_to_number
-
-from ..base_op import OPERATORS, Selector
-
-
-
[docs]@OPERATORS.register_module('topk_specified_field_selector') -class TopkSpecifiedFieldSelector(Selector): - """Selector to select top samples based on the sorted specified field - value.""" - -
[docs] def __init__(self,
-                 field_key: str = '',
-                 top_ratio: Optional[Annotated[float,
-                                               Field(ge=0, le=1)]] = None,
-                 topk: Optional[PositiveInt] = None,
-                 reverse: bool = True,
-                 *args,
-                 **kwargs):
-        """
-        Initialization method.
-
-        :param field_key: Selector based on the specified value
-            corresponding to the target key. The target key
-            corresponding to multi-level field information needs to be
-            separated by '.'.
-        :param top_ratio: Ratio of selected top samples, samples will be
-            selected if their specified field values are within this
-            parameter. When both topk and top_ratio are set, the value
-            corresponding to the smaller number of samples will be
-            applied.
-        :param topk: Number of selected top samples, samples will be
-            selected if their specified field values are within this
-            parameter. When both topk and top_ratio are set, the value
-            corresponding to the smaller number of samples will be
-            applied.
-        :param reverse: Determine the sorting rule, if reverse=True,
-            then sort in descending order.
-        :param args: extra args
-        :param kwargs: extra args
-        """
-        super().__init__(*args, **kwargs)
-        self.field_key = field_key
-        self.top_ratio = top_ratio
-        self.topk = topk
-        self.reverse = reverse
- -
[docs] def process(self, dataset): - if len(dataset) <= 1 or not self.field_key: - return dataset - - select_num = 0 - if not self.top_ratio: - if not self.topk: - return dataset - else: - select_num = self.topk - else: - select_num = self.top_ratio * len(dataset) - if self.topk and self.topk < select_num: - select_num = self.topk - - field_keys = self.field_key.split('.') - assert field_keys[0] in dataset.features.keys( - ), "'{}' not in {}".format(field_keys[0], dataset.features.keys()) - - if len(field_keys) == 1: - field_value_list = dataset[field_keys[0]] - else: - field_value_list = [] - for item in dataset[field_keys[0]]: - field_value = item - for key in field_keys[1:]: - assert key in field_value.keys(), "'{}' not in {}".format( - key, field_value.keys()) - field_value = field_value[key] - field_value_list.append( - stats_to_number(field_value, self.reverse)) - - if self.reverse: - select_index = heapq.nlargest(int(select_num), range(len(dataset)), - field_value_list.__getitem__) - else: - select_index = heapq.nsmallest(int(select_num), - range(len(dataset)), - field_value_list.__getitem__) - return dataset.select(select_index)
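A small sketch of the top-k selection above (added for illustration; assumes data_juicer and datasets are installed, with the '__dj__stats__' field holding per-sample stats):

```python
# Sketch: select the two samples with the largest word_num stat.
from datasets import Dataset

from data_juicer.ops.selector.topk_specified_field_selector import \
    TopkSpecifiedFieldSelector

ds = Dataset.from_dict({
    'text': ['a', 'b', 'c', 'd'],
    '__dj__stats__': [{'word_num': 10.0}, {'word_num': 40.0},
                      {'word_num': 20.0}, {'word_num': 30.0}],
})
op = TopkSpecifiedFieldSelector(field_key='__dj__stats__.word_num',
                                topk=2, reverse=True)
print(op.process(ds)['text'])  # ['b', 'd']
```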
\ No newline at end of file
diff --git a/_modules/index.html b/_modules/index.html
index 5f51e35da..3f5e03bef 100644
--- a/_modules/index.html
+++ b/_modules/index.html
@@ -76,17 +76,7 @@

All modules for which code is available

@@ -83,196 +78,8 @@
-
-

data_juicer.analysis

class data_juicer.analysis.ColumnWiseAnalysis(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]

    Bases: object

    Apply analysis on each column of stats respectively.

    __init__(dataset, output_path, overall_result=None, save_stats_in_one_file=True)[source]

        Initialization method.

        Parameters:
            • dataset – the dataset to be analyzed
            • output_path – path to store the analysis results
            • overall_result – optional precomputed overall stats result
            • save_stats_in_one_file – whether to save all analysis figures of all stats into one image file

    analyze(show_percentiles=False, show=False, skip_export=False)[source]

        Apply analysis and draw the analysis figure for stats.

        Parameters:
            • show_percentiles – whether to show the percentile lines in each sub-figure. If true, several red lines will indicate the quantiles of the stats distributions
            • show – whether to show the figure in a single window after drawing
            • skip_export – whether to save the results to disk

    draw_hist(ax, data, save_path, percentiles=None, show=False)[source]

        Draw the histogram for the data.

        Parameters:
            • ax – the axes to draw on
            • data – data to draw
            • save_path – the path to save the histogram figure
            • percentiles – the overall analysis result of the data, including percentile information
            • show – whether to show the figure in a single window after drawing

    draw_box(ax, data, save_path, percentiles=None, show=False)[source]

        Draw the box plot for the data.

        Parameters:
            • ax – the axes to draw on
            • data – data to draw
            • save_path – the path to save the box figure
            • percentiles – the overall analysis result of the data, including percentile information
            • show – whether to show the figure in a single window after drawing

class data_juicer.analysis.DiversityAnalysis(dataset, output_path, lang_or_model='en')[source]

    Bases: object

    Apply diversity analysis for each sample and get an overall analysis result.

    __init__(dataset, output_path, lang_or_model='en')[source]

        Initialization method.

        Parameters:
            • dataset – the dataset to be analyzed
            • output_path – path to store the analysis results
            • lang_or_model – the diversity model or a specific language used to load the diversity model

    compute(lang_or_model=None, column_name='text')[source]

        Apply lexical tree analysis on each sample.

        Parameters:
            • lang_or_model – the diversity model or a specific language used to load the diversity model
            • column_name – the name of the column to be analyzed

        Returns: the analysis result.

    analyze(lang_or_model=None, column_name='text', postproc_func=<function get_diversity>, **postproc_kwarg)[source]

        Apply diversity analysis on the whole dataset.

        Parameters:
            • lang_or_model – the diversity model or a specific language used to load the diversity model
            • column_name – the name of the column to be analyzed
            • postproc_func – function to analyze diversity. By default, it's the function get_diversity
            • postproc_kwarg – arguments of the postproc_func

class data_juicer.analysis.OverallAnalysis(dataset, output_path)[source]

    Bases: object

    Apply analysis on the overall stats, including mean, std, quantiles, etc.

    __init__(dataset, output_path)[source]

        Initialization method.

        Parameters:
            • dataset – the dataset to be analyzed
            • output_path – path to store the analysis results.

    refine_single_column(col)[source]

    analyze(percentiles=[], num_proc=1, skip_export=False)[source]

        Apply overall analysis on the whole dataset based on the describe method of pandas.

        Parameters:
            • percentiles – percentiles to analyze
            • num_proc – number of processes to analyze the dataset
            • skip_export – whether to export the results to disk

        Returns: the overall analysis result.
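To tie these classes together, here is a minimal sketch of a typical flow (an addition for illustration; it assumes data_juicer and datasets are installed, that stats live under the '__dj__stats__' field, and that './analysis' is a placeholder output directory):

```python
# Sketch: run the overall analysis, then reuse its result for the
# column-wise figures.
from datasets import Dataset

from data_juicer.analysis import ColumnWiseAnalysis, OverallAnalysis

ds = Dataset.from_dict({
    'text': ['short', 'a much longer sample text'],
    '__dj__stats__': [{'word_num': 1.0}, {'word_num': 5.0}],
})
oa = OverallAnalysis(ds, output_path='./analysis')
overall = oa.analyze(percentiles=[0.25, 0.5, 0.75])

cwa = ColumnWiseAnalysis(ds, './analysis', overall_result=overall,
                         save_stats_in_one_file=True)
cwa.analyze(show_percentiles=True)  # writes all-stats.png to ./analysis
```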
+

data_juicer.analysis

diff --git a/data_juicer.core.html b/data_juicer.core.html
index f0f47f61e..1b45f9e54 100644
--- a/data_juicer.core.html
+++ b/data_juicer.core.html
@@ -42,16 +42,7 @@